1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*   Copyright (C) 2010-2012, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*   file name:  stringtriebuilder.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2010dec24
14*   created by: Markus W. Scherer
15*/
16
17#include "utypeinfo.h"  // for 'typeid' to work
18#include "unicode/utypes.h"
19#include "unicode/stringtriebuilder.h"
20#include "uassert.h"
21#include "uhash.h"
22
23U_CDECL_BEGIN
24
25static int32_t U_CALLCONV
26hashStringTrieNode(const UHashTok key) {
27    return icu::StringTrieBuilder::hashNode(key.pointer);
28}
29
30static UBool U_CALLCONV
31equalStringTrieNodes(const UHashTok key1, const UHashTok key2) {
32    return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer);
33}
34
35U_CDECL_END
36
37U_NAMESPACE_BEGIN
38
39StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {}
40
41StringTrieBuilder::~StringTrieBuilder() {
42    deleteCompactBuilder();
43}
44
45void
46StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) {
47    if(U_FAILURE(errorCode)) {
48        return;
49    }
50    nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL,
51                         sizeGuess, &errorCode);
52    if(U_SUCCESS(errorCode)) {
53        if(nodes==NULL) {
54          errorCode=U_MEMORY_ALLOCATION_ERROR;
55        } else {
56          uhash_setKeyDeleter(nodes, uprv_deleteUObject);
57        }
58    }
59}
60
61void
62StringTrieBuilder::deleteCompactBuilder() {
63    uhash_close(nodes);
64    nodes=NULL;
65}
66
67void
68StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength,
69                       UErrorCode &errorCode) {
70    if(buildOption==USTRINGTRIE_BUILD_FAST) {
71        writeNode(0, elementsLength, 0);
72    } else /* USTRINGTRIE_BUILD_SMALL */ {
73        createCompactBuilder(2*elementsLength, errorCode);
74        Node *root=makeNode(0, elementsLength, 0, errorCode);
75        if(U_SUCCESS(errorCode)) {
76            root->markRightEdgesFirst(-1);
77            root->write(*this);
78        }
79        deleteCompactBuilder();
80    }
81}
82
83// Requires start<limit,
84// and all strings of the [start..limit[ elements must be sorted and
85// have a common prefix of length unitIndex.
86int32_t
87StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
88    UBool hasValue=FALSE;
89    int32_t value=0;
90    int32_t type;
91    if(unitIndex==getElementStringLength(start)) {
92        // An intermediate or final value.
93        value=getElementValue(start++);
94        if(start==limit) {
95            return writeValueAndFinal(value, TRUE);  // final-value node
96        }
97        hasValue=TRUE;
98    }
99    // Now all [start..limit[ strings are longer than unitIndex.
100    int32_t minUnit=getElementUnit(start, unitIndex);
101    int32_t maxUnit=getElementUnit(limit-1, unitIndex);
102    if(minUnit==maxUnit) {
103        // Linear-match node: All strings have the same character at unitIndex.
104        int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
105        writeNode(start, limit, lastUnitIndex);
106        // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
107        int32_t length=lastUnitIndex-unitIndex;
108        int32_t maxLinearMatchLength=getMaxLinearMatchLength();
109        while(length>maxLinearMatchLength) {
110            lastUnitIndex-=maxLinearMatchLength;
111            length-=maxLinearMatchLength;
112            writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
113            write(getMinLinearMatch()+maxLinearMatchLength-1);
114        }
115        writeElementUnits(start, unitIndex, length);
116        type=getMinLinearMatch()+length-1;
117    } else {
118        // Branch node.
119        int32_t length=countElementUnits(start, limit, unitIndex);
120        // length>=2 because minUnit!=maxUnit.
121        writeBranchSubNode(start, limit, unitIndex, length);
122        if(--length<getMinLinearMatch()) {
123            type=length;
124        } else {
125            write(length);
126            type=0;
127        }
128    }
129    return writeValueAndType(hasValue, value, type);
130}
131
132// start<limit && all strings longer than unitIndex &&
133// length different units at unitIndex
134int32_t
135StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
136    UChar middleUnits[kMaxSplitBranchLevels];
137    int32_t lessThan[kMaxSplitBranchLevels];
138    int32_t ltLength=0;
139    while(length>getMaxBranchLinearSubNodeLength()) {
140        // Branch on the middle unit.
141        // First, find the middle unit.
142        int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
143        // Encode the less-than branch first.
144        middleUnits[ltLength]=getElementUnit(i, unitIndex);  // middle unit
145        lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2);
146        ++ltLength;
147        // Continue for the greater-or-equal branch.
148        start=i;
149        length=length-length/2;
150    }
151    // For each unit, find its elements array start and whether it has a final value.
152    int32_t starts[kMaxBranchLinearSubNodeLength];
153    UBool isFinal[kMaxBranchLinearSubNodeLength-1];
154    int32_t unitNumber=0;
155    do {
156        int32_t i=starts[unitNumber]=start;
157        UChar unit=getElementUnit(i++, unitIndex);
158        i=indexOfElementWithNextUnit(i, unitIndex, unit);
159        isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start);
160        start=i;
161    } while(++unitNumber<length-1);
162    // unitNumber==length-1, and the maxUnit elements range is [start..limit[
163    starts[unitNumber]=start;
164
165    // Write the sub-nodes in reverse order: The jump lengths are deltas from
166    // after their own positions, so if we wrote the minUnit sub-node first,
167    // then its jump delta would be larger.
168    // Instead we write the minUnit sub-node last, for a shorter delta.
169    int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1];
170    do {
171        --unitNumber;
172        if(!isFinal[unitNumber]) {
173            jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
174        }
175    } while(unitNumber>0);
176    // The maxUnit sub-node is written as the very last one because we do
177    // not jump for it at all.
178    unitNumber=length-1;
179    writeNode(start, limit, unitIndex+1);
180    int32_t offset=write(getElementUnit(start, unitIndex));
181    // Write the rest of this node's unit-value pairs.
182    while(--unitNumber>=0) {
183        start=starts[unitNumber];
184        int32_t value;
185        if(isFinal[unitNumber]) {
186            // Write the final value for the one string ending with this unit.
187            value=getElementValue(start);
188        } else {
189            // Write the delta to the start position of the sub-node.
190            value=offset-jumpTargets[unitNumber];
191        }
192        writeValueAndFinal(value, isFinal[unitNumber]);
193        offset=write(getElementUnit(start, unitIndex));
194    }
195    // Write the split-branch nodes.
196    while(ltLength>0) {
197        --ltLength;
198        writeDeltaTo(lessThan[ltLength]);
199        offset=write(middleUnits[ltLength]);
200    }
201    return offset;
202}
203
204// Requires start<limit,
205// and all strings of the [start..limit[ elements must be sorted and
206// have a common prefix of length unitIndex.
207StringTrieBuilder::Node *
208StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
209    if(U_FAILURE(errorCode)) {
210        return NULL;
211    }
212    UBool hasValue=FALSE;
213    int32_t value=0;
214    if(unitIndex==getElementStringLength(start)) {
215        // An intermediate or final value.
216        value=getElementValue(start++);
217        if(start==limit) {
218            return registerFinalValue(value, errorCode);
219        }
220        hasValue=TRUE;
221    }
222    Node *node;
223    // Now all [start..limit[ strings are longer than unitIndex.
224    int32_t minUnit=getElementUnit(start, unitIndex);
225    int32_t maxUnit=getElementUnit(limit-1, unitIndex);
226    if(minUnit==maxUnit) {
227        // Linear-match node: All strings have the same character at unitIndex.
228        int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
229        Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
230        // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
231        int32_t length=lastUnitIndex-unitIndex;
232        int32_t maxLinearMatchLength=getMaxLinearMatchLength();
233        while(length>maxLinearMatchLength) {
234            lastUnitIndex-=maxLinearMatchLength;
235            length-=maxLinearMatchLength;
236            node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
237            nextNode=registerNode(node, errorCode);
238        }
239        node=createLinearMatchNode(start, unitIndex, length, nextNode);
240    } else {
241        // Branch node.
242        int32_t length=countElementUnits(start, limit, unitIndex);
243        // length>=2 because minUnit!=maxUnit.
244        Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
245        node=new BranchHeadNode(length, subNode);
246    }
247    if(hasValue && node!=NULL) {
248        if(matchNodesCanHaveValues()) {
249            ((ValueNode *)node)->setValue(value);
250        } else {
251            node=new IntermediateValueNode(value, registerNode(node, errorCode));
252        }
253    }
254    return registerNode(node, errorCode);
255}
256
257// start<limit && all strings longer than unitIndex &&
258// length different units at unitIndex
259StringTrieBuilder::Node *
260StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
261                                   int32_t length, UErrorCode &errorCode) {
262    if(U_FAILURE(errorCode)) {
263        return NULL;
264    }
265    UChar middleUnits[kMaxSplitBranchLevels];
266    Node *lessThan[kMaxSplitBranchLevels];
267    int32_t ltLength=0;
268    while(length>getMaxBranchLinearSubNodeLength()) {
269        // Branch on the middle unit.
270        // First, find the middle unit.
271        int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
272        // Create the less-than branch.
273        middleUnits[ltLength]=getElementUnit(i, unitIndex);  // middle unit
274        lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
275        ++ltLength;
276        // Continue for the greater-or-equal branch.
277        start=i;
278        length=length-length/2;
279    }
280    if(U_FAILURE(errorCode)) {
281        return NULL;
282    }
283    ListBranchNode *listNode=new ListBranchNode();
284    if(listNode==NULL) {
285        errorCode=U_MEMORY_ALLOCATION_ERROR;
286        return NULL;
287    }
288    // For each unit, find its elements array start and whether it has a final value.
289    int32_t unitNumber=0;
290    do {
291        int32_t i=start;
292        UChar unit=getElementUnit(i++, unitIndex);
293        i=indexOfElementWithNextUnit(i, unitIndex, unit);
294        if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
295            listNode->add(unit, getElementValue(start));
296        } else {
297            listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
298        }
299        start=i;
300    } while(++unitNumber<length-1);
301    // unitNumber==length-1, and the maxUnit elements range is [start..limit[
302    UChar unit=getElementUnit(start, unitIndex);
303    if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
304        listNode->add(unit, getElementValue(start));
305    } else {
306        listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
307    }
308    Node *node=registerNode(listNode, errorCode);
309    // Create the split-branch nodes.
310    while(ltLength>0) {
311        --ltLength;
312        node=registerNode(
313            new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
314    }
315    return node;
316}
317
318StringTrieBuilder::Node *
319StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
320    if(U_FAILURE(errorCode)) {
321        delete newNode;
322        return NULL;
323    }
324    if(newNode==NULL) {
325        errorCode=U_MEMORY_ALLOCATION_ERROR;
326        return NULL;
327    }
328    const UHashElement *old=uhash_find(nodes, newNode);
329    if(old!=NULL) {
330        delete newNode;
331        return (Node *)old->key.pointer;
332    }
333    // If uhash_puti() returns a non-zero value from an equivalent, previously
334    // registered node, then uhash_find() failed to find that and we will leak newNode.
335#if U_DEBUG
336    int32_t oldValue=  // Only in debug mode to avoid a compiler warning about unused oldValue.
337#endif
338    uhash_puti(nodes, newNode, 1, &errorCode);
339    U_ASSERT(oldValue==0);
340    if(U_FAILURE(errorCode)) {
341        delete newNode;
342        return NULL;
343    }
344    return newNode;
345}
346
347StringTrieBuilder::Node *
348StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
349    if(U_FAILURE(errorCode)) {
350        return NULL;
351    }
352    FinalValueNode key(value);
353    const UHashElement *old=uhash_find(nodes, &key);
354    if(old!=NULL) {
355        return (Node *)old->key.pointer;
356    }
357    Node *newNode=new FinalValueNode(value);
358    if(newNode==NULL) {
359        errorCode=U_MEMORY_ALLOCATION_ERROR;
360        return NULL;
361    }
362    // If uhash_puti() returns a non-zero value from an equivalent, previously
363    // registered node, then uhash_find() failed to find that and we will leak newNode.
364#if U_DEBUG
365    int32_t oldValue=  // Only in debug mode to avoid a compiler warning about unused oldValue.
366#endif
367    uhash_puti(nodes, newNode, 1, &errorCode);
368    U_ASSERT(oldValue==0);
369    if(U_FAILURE(errorCode)) {
370        delete newNode;
371        return NULL;
372    }
373    return newNode;
374}
375
376UBool
377StringTrieBuilder::hashNode(const void *node) {
378    return ((const Node *)node)->hashCode();
379}
380
381UBool
382StringTrieBuilder::equalNodes(const void *left, const void *right) {
383    return *(const Node *)left==*(const Node *)right;
384}
385
386UBool
387StringTrieBuilder::Node::operator==(const Node &other) const {
388    return this==&other || (typeid(*this)==typeid(other) && hash==other.hash);
389}
390
391int32_t
392StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
393    if(offset==0) {
394        offset=edgeNumber;
395    }
396    return edgeNumber;
397}
398
399UBool
400StringTrieBuilder::FinalValueNode::operator==(const Node &other) const {
401    if(this==&other) {
402        return TRUE;
403    }
404    if(!Node::operator==(other)) {
405        return FALSE;
406    }
407    const FinalValueNode &o=(const FinalValueNode &)other;
408    return value==o.value;
409}
410
411void
412StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) {
413    offset=builder.writeValueAndFinal(value, TRUE);
414}
415
416UBool
417StringTrieBuilder::ValueNode::operator==(const Node &other) const {
418    if(this==&other) {
419        return TRUE;
420    }
421    if(!Node::operator==(other)) {
422        return FALSE;
423    }
424    const ValueNode &o=(const ValueNode &)other;
425    return hasValue==o.hasValue && (!hasValue || value==o.value);
426}
427
428UBool
429StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const {
430    if(this==&other) {
431        return TRUE;
432    }
433    if(!ValueNode::operator==(other)) {
434        return FALSE;
435    }
436    const IntermediateValueNode &o=(const IntermediateValueNode &)other;
437    return next==o.next;
438}
439
440int32_t
441StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) {
442    if(offset==0) {
443        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
444    }
445    return edgeNumber;
446}
447
448void
449StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) {
450    next->write(builder);
451    offset=builder.writeValueAndFinal(value, FALSE);
452}
453
454UBool
455StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
456    if(this==&other) {
457        return TRUE;
458    }
459    if(!ValueNode::operator==(other)) {
460        return FALSE;
461    }
462    const LinearMatchNode &o=(const LinearMatchNode &)other;
463    return length==o.length && next==o.next;
464}
465
466int32_t
467StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
468    if(offset==0) {
469        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
470    }
471    return edgeNumber;
472}
473
474UBool
475StringTrieBuilder::ListBranchNode::operator==(const Node &other) const {
476    if(this==&other) {
477        return TRUE;
478    }
479    if(!Node::operator==(other)) {
480        return FALSE;
481    }
482    const ListBranchNode &o=(const ListBranchNode &)other;
483    for(int32_t i=0; i<length; ++i) {
484        if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
485            return FALSE;
486        }
487    }
488    return TRUE;
489}
490
491int32_t
492StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
493    if(offset==0) {
494        firstEdgeNumber=edgeNumber;
495        int32_t step=0;
496        int32_t i=length;
497        do {
498            Node *edge=equal[--i];
499            if(edge!=NULL) {
500                edgeNumber=edge->markRightEdgesFirst(edgeNumber-step);
501            }
502            // For all but the rightmost edge, decrement the edge number.
503            step=1;
504        } while(i>0);
505        offset=edgeNumber;
506    }
507    return edgeNumber;
508}
509
510void
511StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) {
512    // Write the sub-nodes in reverse order: The jump lengths are deltas from
513    // after their own positions, so if we wrote the minUnit sub-node first,
514    // then its jump delta would be larger.
515    // Instead we write the minUnit sub-node last, for a shorter delta.
516    int32_t unitNumber=length-1;
517    Node *rightEdge=equal[unitNumber];
518    int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
519    do {
520        --unitNumber;
521        if(equal[unitNumber]!=NULL) {
522            equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
523        }
524    } while(unitNumber>0);
525    // The maxUnit sub-node is written as the very last one because we do
526    // not jump for it at all.
527    unitNumber=length-1;
528    if(rightEdge==NULL) {
529        builder.writeValueAndFinal(values[unitNumber], TRUE);
530    } else {
531        rightEdge->write(builder);
532    }
533    offset=builder.write(units[unitNumber]);
534    // Write the rest of this node's unit-value pairs.
535    while(--unitNumber>=0) {
536        int32_t value;
537        UBool isFinal;
538        if(equal[unitNumber]==NULL) {
539            // Write the final value for the one string ending with this unit.
540            value=values[unitNumber];
541            isFinal=TRUE;
542        } else {
543            // Write the delta to the start position of the sub-node.
544            U_ASSERT(equal[unitNumber]->getOffset()>0);
545            value=offset-equal[unitNumber]->getOffset();
546            isFinal=FALSE;
547        }
548        builder.writeValueAndFinal(value, isFinal);
549        offset=builder.write(units[unitNumber]);
550    }
551}
552
553UBool
554StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
555    if(this==&other) {
556        return TRUE;
557    }
558    if(!Node::operator==(other)) {
559        return FALSE;
560    }
561    const SplitBranchNode &o=(const SplitBranchNode &)other;
562    return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
563}
564
565int32_t
566StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
567    if(offset==0) {
568        firstEdgeNumber=edgeNumber;
569        edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber);
570        offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1);
571    }
572    return edgeNumber;
573}
574
575void
576StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) {
577    // Encode the less-than branch first.
578    lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
579    // Encode the greater-or-equal branch last because we do not jump for it at all.
580    greaterOrEqual->write(builder);
581    // Write this node.
582    U_ASSERT(lessThan->getOffset()>0);
583    builder.writeDeltaTo(lessThan->getOffset());  // less-than
584    offset=builder.write(unit);
585}
586
587UBool
588StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
589    if(this==&other) {
590        return TRUE;
591    }
592    if(!ValueNode::operator==(other)) {
593        return FALSE;
594    }
595    const BranchHeadNode &o=(const BranchHeadNode &)other;
596    return length==o.length && next==o.next;
597}
598
599int32_t
600StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
601    if(offset==0) {
602        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
603    }
604    return edgeNumber;
605}
606
607void
608StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) {
609    next->write(builder);
610    if(length<=builder.getMinLinearMatch()) {
611        offset=builder.writeValueAndType(hasValue, value, length-1);
612    } else {
613        builder.write(length-1);
614        offset=builder.writeValueAndType(hasValue, value, 0);
615    }
616}
617
618U_NAMESPACE_END
619