1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*   Copyright (C) 2010-2012,2014, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*   file name:  stringtriebuilder.h
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2010dec24
14*   created by: Markus W. Scherer
15*/
16
17#ifndef __STRINGTRIEBUILDER_H__
18#define __STRINGTRIEBUILDER_H__
19
20#include "unicode/utypes.h"
21#include "unicode/uobject.h"
22
23/**
24 * \file
25 * \brief C++ API: Builder API for trie builders
26 */
27
28// Forward declaration.
29struct UHashtable;
30typedef struct UHashtable UHashtable;
31
32/**
33 * Build options for BytesTrieBuilder and CharsTrieBuilder.
34 * @stable ICU 4.8
35 */
36enum UStringTrieBuildOption {
37    /**
38     * Builds a trie quickly.
39     * @stable ICU 4.8
40     */
41    USTRINGTRIE_BUILD_FAST,
42    /**
43     * Builds a trie more slowly, attempting to generate
44     * a shorter but equivalent serialization.
45     * This build option also uses more memory.
46     *
47     * This option can be effective when many integer values are the same
48     * and string/byte sequence suffixes can be shared.
49     * Runtime speed is not expected to improve.
50     * @stable ICU 4.8
51     */
52    USTRINGTRIE_BUILD_SMALL
53};
54
55U_NAMESPACE_BEGIN
56
57/**
58 * Base class for string trie builder classes.
59 *
60 * This class is not intended for public subclassing.
61 * @stable ICU 4.8
62 */
63class U_COMMON_API StringTrieBuilder : public UObject {
64public:
65#ifndef U_HIDE_INTERNAL_API
66    /** @internal */
67    static UBool hashNode(const void *node);
68    /** @internal */
69    static UBool equalNodes(const void *left, const void *right);
70#endif  /* U_HIDE_INTERNAL_API */
71
72protected:
73    // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
74    // or else the compiler will create a public default constructor.
75    /** @internal */
76    StringTrieBuilder();
77    /** @internal */
78    virtual ~StringTrieBuilder();
79
80#ifndef U_HIDE_INTERNAL_API
81    /** @internal */
82    void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
83    /** @internal */
84    void deleteCompactBuilder();
85
86    /** @internal */
87    void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
88
89    /** @internal */
90    int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
91    /** @internal */
92    int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
93#endif  /* U_HIDE_INTERNAL_API */
94
95    class Node;
96
97#ifndef U_HIDE_INTERNAL_API
98    /** @internal */
99    Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
100    /** @internal */
101    Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
102                            int32_t length, UErrorCode &errorCode);
103#endif  /* U_HIDE_INTERNAL_API */
104
105    /** @internal */
106    virtual int32_t getElementStringLength(int32_t i) const = 0;
107    /** @internal */
108    virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
109    /** @internal */
110    virtual int32_t getElementValue(int32_t i) const = 0;
111
112    // Finds the first unit index after this one where
113    // the first and last element have different units again.
114    /** @internal */
115    virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
116
117    // Number of different units at unitIndex.
118    /** @internal */
119    virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
120    /** @internal */
121    virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
122    /** @internal */
123    virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
124
125    /** @internal */
126    virtual UBool matchNodesCanHaveValues() const = 0;
127
128    /** @internal */
129    virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
130    /** @internal */
131    virtual int32_t getMinLinearMatch() const = 0;
132    /** @internal */
133    virtual int32_t getMaxLinearMatchLength() const = 0;
134
135#ifndef U_HIDE_INTERNAL_API
136    // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
137    /** @internal */
138    static const int32_t kMaxBranchLinearSubNodeLength=5;
139
140    // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
141    // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
142    /** @internal */
143    static const int32_t kMaxSplitBranchLevels=14;
144
145    /**
146     * Makes sure that there is only one unique node registered that is
147     * equivalent to newNode.
148     * @param newNode Input node. The builder takes ownership.
149     * @param errorCode ICU in/out UErrorCode.
150                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
151     * @return newNode if it is the first of its kind, or
152     *         an equivalent node if newNode is a duplicate.
153     * @internal
154     */
155    Node *registerNode(Node *newNode, UErrorCode &errorCode);
156    /**
157     * Makes sure that there is only one unique FinalValueNode registered
158     * with this value.
159     * Avoids creating a node if the value is a duplicate.
160     * @param value A final value.
161     * @param errorCode ICU in/out UErrorCode.
162                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
163     * @return A FinalValueNode with the given value.
164     * @internal
165     */
166    Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
167#endif  /* U_HIDE_INTERNAL_API */
168
169    /*
170     * C++ note:
171     * registerNode() and registerFinalValue() take ownership of their input nodes,
172     * and only return owned nodes.
173     * If they see a failure UErrorCode, they will delete the input node.
174     * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
175     * If there is a failure, they return NULL.
176     *
177     * NULL Node pointers can be safely passed into other Nodes because
178     * they call the static Node::hashCode() which checks for a NULL pointer first.
179     *
180     * Therefore, as long as builder functions register a new node,
181     * they need to check for failures only before explicitly dereferencing
182     * a Node pointer, or before setting a new UErrorCode.
183     */
184
185    // Hash set of nodes, maps from nodes to integer 1.
186    /** @internal */
187    UHashtable *nodes;
188
189    // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
190    // it is needed for layout of other objects.
191    /** @internal */
192    class Node : public UObject {
193    public:
194        Node(int32_t initialHash) : hash(initialHash), offset(0) {}
195        inline int32_t hashCode() const { return hash; }
196        // Handles node==NULL.
197        static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
198        // Base class operator==() compares the actual class types.
199        virtual UBool operator==(const Node &other) const;
200        inline UBool operator!=(const Node &other) const { return !operator==(other); }
201        /**
202         * Traverses the Node graph and numbers branch edges, with rightmost edges first.
203         * This is to avoid writing a duplicate node twice.
204         *
205         * Branch nodes in this trie data structure are not symmetric.
206         * Most branch edges "jump" to other nodes but the rightmost branch edges
207         * just continue without a jump.
208         * Therefore, write() must write the rightmost branch edge last
209         * (trie units are written backwards), and must write it at that point even if
210         * it is a duplicate of a node previously written elsewhere.
211         *
212         * This function visits and marks right branch edges first.
213         * Edges are numbered with increasingly negative values because we share the
214         * offset field which gets positive values when nodes are written.
215         * A branch edge also remembers the first number for any of its edges.
216         *
217         * When a further-left branch edge has a number in the range of the rightmost
218         * edge's numbers, then it will be written as part of the required right edge
219         * and we can avoid writing it first.
220         *
221         * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
222         * edge numbers.
223         *
224         * @param edgeNumber The first edge number for this node and its sub-nodes.
225         * @return An edge number that is at least the maximum-negative
226         *         of the input edge number and the numbers of this node and all of its sub-nodes.
227         */
228        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
229        // write() must set the offset to a positive value.
230        virtual void write(StringTrieBuilder &builder) = 0;
231        // See markRightEdgesFirst.
232        inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
233                                               StringTrieBuilder &builder) {
234            // Note: Edge numbers are negative, lastRight<=firstRight.
235            // If offset>0 then this node and its sub-nodes have been written already
236            // and we need not write them again.
237            // If this node is part of the unwritten right branch edge,
238            // then we wait until that is written.
239            if(offset<0 && (offset<lastRight || firstRight<offset)) {
240                write(builder);
241            }
242        }
243        inline int32_t getOffset() const { return offset; }
244    protected:
245        int32_t hash;
246        int32_t offset;
247    };
248
249#ifndef U_HIDE_INTERNAL_API
250    // This class should not be overridden because
251    // registerFinalValue() compares a stack-allocated FinalValueNode
252    // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
253    // with the input node, and the
254    // !Node::operator==(other) used inside FinalValueNode::operator==(other)
255    // will be false if the typeid's are different.
256    /** @internal */
257    class FinalValueNode : public Node {
258    public:
259        FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
260        virtual UBool operator==(const Node &other) const;
261        virtual void write(StringTrieBuilder &builder);
262    protected:
263        int32_t value;
264    };
265#endif  /* U_HIDE_INTERNAL_API */
266
267    // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
268    // it is needed for layout of other objects.
269    /**
270     * @internal
271     */
272    class ValueNode : public Node {
273    public:
274        ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
275        virtual UBool operator==(const Node &other) const;
276        void setValue(int32_t v) {
277            hasValue=TRUE;
278            value=v;
279            hash=hash*37+v;
280        }
281    protected:
282        UBool hasValue;
283        int32_t value;
284    };
285
286#ifndef U_HIDE_INTERNAL_API
287    /**
288     * @internal
289     */
290    class IntermediateValueNode : public ValueNode {
291    public:
292        IntermediateValueNode(int32_t v, Node *nextNode)
293                : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
294        virtual UBool operator==(const Node &other) const;
295        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
296        virtual void write(StringTrieBuilder &builder);
297    protected:
298        Node *next;
299    };
300#endif  /* U_HIDE_INTERNAL_API */
301
302    // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
303    // it is needed for layout of other objects.
304    /**
305     * @internal
306     */
307    class LinearMatchNode : public ValueNode {
308    public:
309        LinearMatchNode(int32_t len, Node *nextNode)
310                : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
311                  length(len), next(nextNode) {}
312        virtual UBool operator==(const Node &other) const;
313        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
314    protected:
315        int32_t length;
316        Node *next;
317    };
318
319#ifndef U_HIDE_INTERNAL_API
320    /**
321     * @internal
322     */
323    class BranchNode : public Node {
324    public:
325        BranchNode(int32_t initialHash) : Node(initialHash) {}
326    protected:
327        int32_t firstEdgeNumber;
328    };
329
330    /**
331     * @internal
332     */
333    class ListBranchNode : public BranchNode {
334    public:
335        ListBranchNode() : BranchNode(0x444444), length(0) {}
336        virtual UBool operator==(const Node &other) const;
337        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
338        virtual void write(StringTrieBuilder &builder);
339        // Adds a unit with a final value.
340        void add(int32_t c, int32_t value) {
341            units[length]=(UChar)c;
342            equal[length]=NULL;
343            values[length]=value;
344            ++length;
345            hash=(hash*37+c)*37+value;
346        }
347        // Adds a unit which leads to another match node.
348        void add(int32_t c, Node *node) {
349            units[length]=(UChar)c;
350            equal[length]=node;
351            values[length]=0;
352            ++length;
353            hash=(hash*37+c)*37+hashCode(node);
354        }
355    protected:
356        Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
357        int32_t length;
358        int32_t values[kMaxBranchLinearSubNodeLength];
359        UChar units[kMaxBranchLinearSubNodeLength];
360    };
361
362    /**
363     * @internal
364     */
365    class SplitBranchNode : public BranchNode {
366    public:
367        SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
368                : BranchNode(((0x555555*37+middleUnit)*37+
369                              hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
370                  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
371        virtual UBool operator==(const Node &other) const;
372        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
373        virtual void write(StringTrieBuilder &builder);
374    protected:
375        UChar unit;
376        Node *lessThan;
377        Node *greaterOrEqual;
378    };
379
380    // Branch head node, for writing the actual node lead unit.
381    /** @internal */
382    class BranchHeadNode : public ValueNode {
383    public:
384        BranchHeadNode(int32_t len, Node *subNode)
385                : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
386                  length(len), next(subNode) {}
387        virtual UBool operator==(const Node &other) const;
388        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
389        virtual void write(StringTrieBuilder &builder);
390    protected:
391        int32_t length;
392        Node *next;  // A branch sub-node.
393    };
394#endif  /* U_HIDE_INTERNAL_API */
395
396    /** @internal */
397    virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
398                                        Node *nextNode) const = 0;
399
400    /** @internal */
401    virtual int32_t write(int32_t unit) = 0;
402    /** @internal */
403    virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
404    /** @internal */
405    virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
406    /** @internal */
407    virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
408    /** @internal */
409    virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
410};
411
412U_NAMESPACE_END
413
414#endif  // __STRINGTRIEBUILDER_H__
415