1/*
2*******************************************************************************
3*   Copyright (C) 2010-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  stringtriebuilder.h
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2010dec24
12*   created by: Markus W. Scherer
13*/
14
15#ifndef __STRINGTRIEBUILDER_H__
16#define __STRINGTRIEBUILDER_H__
17
18#include "unicode/utypes.h"
19#include "unicode/uobject.h"
20
21// Forward declaration.
22struct UHashtable;
23typedef struct UHashtable UHashtable;
24
25/**
26 * Build options for BytesTrieBuilder and CharsTrieBuilder.
27 * @draft ICU 4.8
28 */
29enum UStringTrieBuildOption {
30    /**
31     * Builds a trie quickly.
32     * @draft ICU 4.8
33     */
34    USTRINGTRIE_BUILD_FAST,
35    /**
36     * Builds a trie more slowly, attempting to generate
37     * a shorter but equivalent serialization.
38     * This build option also uses more memory.
39     *
40     * This option can be effective when many integer values are the same
41     * and string/byte sequence suffixes can be shared.
42     * Runtime speed is not expected to improve.
43     * @draft ICU 4.8
44     */
45    USTRINGTRIE_BUILD_SMALL
46};
47
48U_NAMESPACE_BEGIN
49
50/**
51 * Base class for string trie builder classes.
52 *
53 * This class is not intended for public subclassing.
54 * @draft ICU 4.8
55 */
56class U_COMMON_API StringTrieBuilder : public UObject {
57public:
58    /** @internal */
59    static UBool hashNode(const void *node);
60    /** @internal */
61    static UBool equalNodes(const void *left, const void *right);
62
63protected:
64    /** @internal */
65    StringTrieBuilder();
66    /** @internal */
67    virtual ~StringTrieBuilder();
68
69    /** @internal */
70    void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
71    /** @internal */
72    void deleteCompactBuilder();
73
74    /** @internal */
75    void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
76
77    /** @internal */
78    int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
79    /** @internal */
80    int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
81
82    class Node;
83
84    /** @internal */
85    Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
86    /** @internal */
87    Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
88                            int32_t length, UErrorCode &errorCode);
89
90    /** @internal */
91    virtual int32_t getElementStringLength(int32_t i) const = 0;
92    /** @internal */
93    virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
94    /** @internal */
95    virtual int32_t getElementValue(int32_t i) const = 0;
96
97    // Finds the first unit index after this one where
98    // the first and last element have different units again.
99    /** @internal */
100    virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
101
102    // Number of different units at unitIndex.
103    /** @internal */
104    virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
105    /** @internal */
106    virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
107    /** @internal */
108    virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
109
110    /** @internal */
111    virtual UBool matchNodesCanHaveValues() const = 0;
112
113    /** @internal */
114    virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
115    /** @internal */
116    virtual int32_t getMinLinearMatch() const = 0;
117    /** @internal */
118    virtual int32_t getMaxLinearMatchLength() const = 0;
119
120    // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
121    /** @internal */
122    static const int32_t kMaxBranchLinearSubNodeLength=5;
123
124    // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
125    // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
126    /** @internal */
127    static const int32_t kMaxSplitBranchLevels=14;
128
129    /**
130     * Makes sure that there is only one unique node registered that is
131     * equivalent to newNode.
132     * @param newNode Input node. The builder takes ownership.
133     * @param errorCode ICU in/out UErrorCode.
134                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
135     * @return newNode if it is the first of its kind, or
136     *         an equivalent node if newNode is a duplicate.
137     * @internal
138     */
139    Node *registerNode(Node *newNode, UErrorCode &errorCode);
140    /**
141     * Makes sure that there is only one unique FinalValueNode registered
142     * with this value.
143     * Avoids creating a node if the value is a duplicate.
144     * @param value A final value.
145     * @param errorCode ICU in/out UErrorCode.
146                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
147     * @return A FinalValueNode with the given value.
148     * @internal
149     */
150    Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
151
152    /*
153     * C++ note:
154     * registerNode() and registerFinalValue() take ownership of their input nodes,
155     * and only return owned nodes.
156     * If they see a failure UErrorCode, they will delete the input node.
157     * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
158     * If there is a failure, they return NULL.
159     *
160     * NULL Node pointers can be safely passed into other Nodes because
161     * they call the static Node::hashCode() which checks for a NULL pointer first.
162     *
163     * Therefore, as long as builder functions register a new node,
164     * they need to check for failures only before explicitly dereferencing
165     * a Node pointer, or before setting a new UErrorCode.
166     */
167
168    // Hash set of nodes, maps from nodes to integer 1.
169    /** @internal */
170    UHashtable *nodes;
171
172    /** @internal */
173    class Node : public UObject {
174    public:
175        Node(int32_t initialHash) : hash(initialHash), offset(0) {}
176        inline int32_t hashCode() const { return hash; }
177        // Handles node==NULL.
178        static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
179        // Base class operator==() compares the actual class types.
180        virtual UBool operator==(const Node &other) const;
181        inline UBool operator!=(const Node &other) const { return !operator==(other); }
182        /**
183         * Traverses the Node graph and numbers branch edges, with rightmost edges first.
184         * This is to avoid writing a duplicate node twice.
185         *
186         * Branch nodes in this trie data structure are not symmetric.
187         * Most branch edges "jump" to other nodes but the rightmost branch edges
188         * just continue without a jump.
189         * Therefore, write() must write the rightmost branch edge last
190         * (trie units are written backwards), and must write it at that point even if
191         * it is a duplicate of a node previously written elsewhere.
192         *
193         * This function visits and marks right branch edges first.
194         * Edges are numbered with increasingly negative values because we share the
195         * offset field which gets positive values when nodes are written.
196         * A branch edge also remembers the first number for any of its edges.
197         *
198         * When a further-left branch edge has a number in the range of the rightmost
199         * edge's numbers, then it will be written as part of the required right edge
200         * and we can avoid writing it first.
201         *
202         * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
203         * edge numbers.
204         *
205         * @param edgeNumber The first edge number for this node and its sub-nodes.
206         * @return An edge number that is at least the maximum-negative
207         *         of the input edge number and the numbers of this node and all of its sub-nodes.
208         */
209        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
210        // write() must set the offset to a positive value.
211        virtual void write(StringTrieBuilder &builder) = 0;
212        // See markRightEdgesFirst.
213        inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
214                                               StringTrieBuilder &builder) {
215            // Note: Edge numbers are negative, lastRight<=firstRight.
216            // If offset>0 then this node and its sub-nodes have been written already
217            // and we need not write them again.
218            // If this node is part of the unwritten right branch edge,
219            // then we wait until that is written.
220            if(offset<0 && (offset<lastRight || firstRight<offset)) {
221                write(builder);
222            }
223        }
224        inline int32_t getOffset() const { return offset; }
225    protected:
226        int32_t hash;
227        int32_t offset;
228    private:
229        // No ICU "poor man's RTTI" for this class nor its subclasses.
230        virtual UClassID getDynamicClassID() const;
231    };
232
233    // This class should not be overridden because
234    // registerFinalValue() compares a stack-allocated FinalValueNode
235    // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
236    // with the input node, and the
237    // !Node::operator==(other) used inside FinalValueNode::operator==(other)
238    // will be false if the typeid's are different.
239    /** @internal */
240    class FinalValueNode : public Node {
241    public:
242        FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
243        virtual UBool operator==(const Node &other) const;
244        virtual void write(StringTrieBuilder &builder);
245    protected:
246        int32_t value;
247    };
248
249    /** @internal */
250    class ValueNode : public Node {
251    public:
252        ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
253        virtual UBool operator==(const Node &other) const;
254        void setValue(int32_t v) {
255            hasValue=TRUE;
256            value=v;
257            hash=hash*37+v;
258        }
259    protected:
260        UBool hasValue;
261        int32_t value;
262    };
263
264    /** @internal */
265    class IntermediateValueNode : public ValueNode {
266    public:
267        IntermediateValueNode(int32_t v, Node *nextNode)
268                : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
269        virtual UBool operator==(const Node &other) const;
270        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
271        virtual void write(StringTrieBuilder &builder);
272    protected:
273        Node *next;
274    };
275
276    /** @internal */
277    class LinearMatchNode : public ValueNode {
278    public:
279        LinearMatchNode(int32_t len, Node *nextNode)
280                : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
281                  length(len), next(nextNode) {}
282        virtual UBool operator==(const Node &other) const;
283        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
284    protected:
285        int32_t length;
286        Node *next;
287    };
288
289    /** @internal */
290    class BranchNode : public Node {
291    public:
292        BranchNode(int32_t initialHash) : Node(initialHash) {}
293    protected:
294        int32_t firstEdgeNumber;
295    };
296
297    /** @internal */
298    class ListBranchNode : public BranchNode {
299    public:
300        ListBranchNode() : BranchNode(0x444444), length(0) {}
301        virtual UBool operator==(const Node &other) const;
302        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
303        virtual void write(StringTrieBuilder &builder);
304        // Adds a unit with a final value.
305        void add(int32_t c, int32_t value) {
306            units[length]=(UChar)c;
307            equal[length]=NULL;
308            values[length]=value;
309            ++length;
310            hash=(hash*37+c)*37+value;
311        }
312        // Adds a unit which leads to another match node.
313        void add(int32_t c, Node *node) {
314            units[length]=(UChar)c;
315            equal[length]=node;
316            values[length]=0;
317            ++length;
318            hash=(hash*37+c)*37+hashCode(node);
319        }
320    protected:
321        Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
322        int32_t length;
323        int32_t values[kMaxBranchLinearSubNodeLength];
324        UChar units[kMaxBranchLinearSubNodeLength];
325    };
326
327    /** @internal */
328    class SplitBranchNode : public BranchNode {
329    public:
330        SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
331                : BranchNode(((0x555555*37+middleUnit)*37+
332                              hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
333                  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
334        virtual UBool operator==(const Node &other) const;
335        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
336        virtual void write(StringTrieBuilder &builder);
337    protected:
338        UChar unit;
339        Node *lessThan;
340        Node *greaterOrEqual;
341    };
342
343    // Branch head node, for writing the actual node lead unit.
344    /** @internal */
345    class BranchHeadNode : public ValueNode {
346    public:
347        BranchHeadNode(int32_t len, Node *subNode)
348                : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
349                  length(len), next(subNode) {}
350        virtual UBool operator==(const Node &other) const;
351        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
352        virtual void write(StringTrieBuilder &builder);
353    protected:
354        int32_t length;
355        Node *next;  // A branch sub-node.
356    };
357
358    /** @internal */
359    virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
360                                        Node *nextNode) const = 0;
361
362    /** @internal */
363    virtual int32_t write(int32_t unit) = 0;
364    /** @internal */
365    virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
366    /** @internal */
367    virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
368    /** @internal */
369    virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
370    /** @internal */
371    virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
372
373private:
374    // No ICU "poor man's RTTI" for this class nor its subclasses.
375    virtual UClassID getDynamicClassID() const;
376};
377
378U_NAMESPACE_END
379
380#endif  // __STRINGTRIEBUILDER_H__
381