1b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/*
2b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*******************************************************************************
3103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius*   Copyright (C) 2010-2012, International Business Machines
4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*   Corporation and others.  All Rights Reserved.
5b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*******************************************************************************
6b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*   file name:  stringtriebuilder.h
7b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*   encoding:   US-ASCII
8b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*   tab size:   8 (not used)
9b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*   indentation:4
10b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*
11b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*   created on: 2010dec24
12b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*   created by: Markus W. Scherer
13b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*/
14b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
15b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#ifndef __STRINGTRIEBUILDER_H__
16b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define __STRINGTRIEBUILDER_H__
17b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
18b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/utypes.h"
19b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/uobject.h"
20b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
2154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/**
2254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * \file
2354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * \brief C++ API: Builder API for trie builders
2454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */
2554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius
26b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Forward declaration.
27b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostruct UHashtable;
28b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehotypedef struct UHashtable UHashtable;
29b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
30b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/**
31b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Build options for BytesTrieBuilder and CharsTrieBuilder.
32103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8
33b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */
34b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoenum UStringTrieBuildOption {
35b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /**
36b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * Builds a trie quickly.
37103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius     * @stable ICU 4.8
38b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     */
39b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    USTRINGTRIE_BUILD_FAST,
40b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /**
41b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * Builds a trie more slowly, attempting to generate
42b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * a shorter but equivalent serialization.
43b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * This build option also uses more memory.
44b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     *
45b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * This option can be effective when many integer values are the same
46b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * and string/byte sequence suffixes can be shared.
47b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * Runtime speed is not expected to improve.
48103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius     * @stable ICU 4.8
49b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     */
50b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    USTRINGTRIE_BUILD_SMALL
51b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
52b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
53b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoU_NAMESPACE_BEGIN
54b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
55b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/**
56b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Base class for string trie builder classes.
57b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho *
58b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * This class is not intended for public subclassing.
59103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8
60b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */
61b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass U_COMMON_API StringTrieBuilder : public UObject {
62b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic:
63103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#ifndef U_HIDE_INTERNAL_API
64b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
65b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    static UBool hashNode(const void *node);
66b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
67b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    static UBool equalNodes(const void *left, const void *right);
68103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#endif  /* U_HIDE_INTERNAL_API */
69b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
70b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprotected:
71103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius    // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
72103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius    // or else the compiler will create a public default constructor.
73b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
74b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    StringTrieBuilder();
75b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
76b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual ~StringTrieBuilder();
77b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
78103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#ifndef U_HIDE_INTERNAL_API
79b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
80b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
81b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
82b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    void deleteCompactBuilder();
83b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
84b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
85b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
86b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
87b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
88b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
89b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
90b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
91103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#endif  /* U_HIDE_INTERNAL_API */
92b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
93b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    class Node;
94b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
95103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#ifndef U_HIDE_INTERNAL_API
96b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
97b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
98b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
99b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
100b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                            int32_t length, UErrorCode &errorCode);
101103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#endif  /* U_HIDE_INTERNAL_API */
102b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
103b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
104b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t getElementStringLength(int32_t i) const = 0;
105b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
106b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
107b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
108b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t getElementValue(int32_t i) const = 0;
109b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
110b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Finds the first unit index after this one where
111b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // the first and last element have different units again.
112b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
113b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
114b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
115b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Number of different units at unitIndex.
116b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
117b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
118b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
119b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
120b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
121b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
122b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
123b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
124b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual UBool matchNodesCanHaveValues() const = 0;
125b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
126b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
127b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
128b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
129b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t getMinLinearMatch() const = 0;
130b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
131b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t getMaxLinearMatchLength() const = 0;
132b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
133103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#ifndef U_HIDE_INTERNAL_API
134b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
135b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
136b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    static const int32_t kMaxBranchLinearSubNodeLength=5;
137b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
138b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
139b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
140b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
141b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    static const int32_t kMaxSplitBranchLevels=14;
142b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
143b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /**
144b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * Makes sure that there is only one unique node registered that is
145b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * equivalent to newNode.
146b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * @param newNode Input node. The builder takes ownership.
147b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * @param errorCode ICU in/out UErrorCode.
148b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
149b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * @return newNode if it is the first of its kind, or
150b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     *         an equivalent node if newNode is a duplicate.
151b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * @internal
152b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     */
153b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    Node *registerNode(Node *newNode, UErrorCode &errorCode);
154b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /**
155b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * Makes sure that there is only one unique FinalValueNode registered
156b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * with this value.
157b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * Avoids creating a node if the value is a duplicate.
158b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * @param value A final value.
159b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * @param errorCode ICU in/out UErrorCode.
160b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
161b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * @return A FinalValueNode with the given value.
162b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * @internal
163b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     */
164b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
165b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
166b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /*
167b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * C++ note:
168b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * registerNode() and registerFinalValue() take ownership of their input nodes,
169b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * and only return owned nodes.
170b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * If they see a failure UErrorCode, they will delete the input node.
171b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
172b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * If there is a failure, they return NULL.
173b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     *
174b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * NULL Node pointers can be safely passed into other Nodes because
175b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * they call the static Node::hashCode() which checks for a NULL pointer first.
176b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     *
177b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * Therefore, as long as builder functions register a new node,
178b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * they need to check for failures only before explicitly dereferencing
179b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     * a Node pointer, or before setting a new UErrorCode.
180b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho     */
181b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
182b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Hash set of nodes, maps from nodes to integer 1.
183b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
184b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UHashtable *nodes;
185b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
186b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
187b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    class Node : public UObject {
188b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    public:
189b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        Node(int32_t initialHash) : hash(initialHash), offset(0) {}
190b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        inline int32_t hashCode() const { return hash; }
191b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Handles node==NULL.
192b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
193b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Base class operator==() compares the actual class types.
194b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual UBool operator==(const Node &other) const;
195b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        inline UBool operator!=(const Node &other) const { return !operator==(other); }
196b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        /**
197b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * Traverses the Node graph and numbers branch edges, with rightmost edges first.
198b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * This is to avoid writing a duplicate node twice.
199b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         *
200b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * Branch nodes in this trie data structure are not symmetric.
201b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * Most branch edges "jump" to other nodes but the rightmost branch edges
202b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * just continue without a jump.
203b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * Therefore, write() must write the rightmost branch edge last
204b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * (trie units are written backwards), and must write it at that point even if
205b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * it is a duplicate of a node previously written elsewhere.
206b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         *
207b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * This function visits and marks right branch edges first.
208b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * Edges are numbered with increasingly negative values because we share the
209b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * offset field which gets positive values when nodes are written.
210b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * A branch edge also remembers the first number for any of its edges.
211b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         *
212b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * When a further-left branch edge has a number in the range of the rightmost
213b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * edge's numbers, then it will be written as part of the required right edge
214b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * and we can avoid writing it first.
215b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         *
216b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
217b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * edge numbers.
218b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         *
219b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * @param edgeNumber The first edge number for this node and its sub-nodes.
220b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         * @return An edge number that is at least the maximum-negative
221b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         *         of the input edge number and the numbers of this node and all of its sub-nodes.
222b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho         */
223b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
224b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // write() must set the offset to a positive value.
225b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual void write(StringTrieBuilder &builder) = 0;
226b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // See markRightEdgesFirst.
227b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
228b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                               StringTrieBuilder &builder) {
229b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // Note: Edge numbers are negative, lastRight<=firstRight.
230b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // If offset>0 then this node and its sub-nodes have been written already
231b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // and we need not write them again.
232b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // If this node is part of the unwritten right branch edge,
233b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            // then we wait until that is written.
234b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            if(offset<0 && (offset<lastRight || firstRight<offset)) {
235b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                write(builder);
236b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            }
237b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
238b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        inline int32_t getOffset() const { return offset; }
239b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    protected:
240b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t hash;
241b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t offset;
242b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
243b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
244b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // This class should not be overridden because
245b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // registerFinalValue() compares a stack-allocated FinalValueNode
246b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
247b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // with the input node, and the
248b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // !Node::operator==(other) used inside FinalValueNode::operator==(other)
249b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // will be false if the typeid's are different.
250b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
251b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    class FinalValueNode : public Node {
252b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    public:
253b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
254b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual UBool operator==(const Node &other) const;
255b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual void write(StringTrieBuilder &builder);
256b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    protected:
257b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t value;
258b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
259b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
26054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    /**
26154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     * @internal
26254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     */
263b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    class ValueNode : public Node {
264b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    public:
265b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
266b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual UBool operator==(const Node &other) const;
267b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        void setValue(int32_t v) {
268b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            hasValue=TRUE;
269b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            value=v;
270b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            hash=hash*37+v;
271b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
272b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    protected:
273b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        UBool hasValue;
274b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t value;
275b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
276b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
27754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    /**
27854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     * @internal
27954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     */
280b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    class IntermediateValueNode : public ValueNode {
281b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    public:
282b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        IntermediateValueNode(int32_t v, Node *nextNode)
283b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
284b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual UBool operator==(const Node &other) const;
285b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
286b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual void write(StringTrieBuilder &builder);
287b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    protected:
288b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        Node *next;
289b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
290b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
29154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    /**
29254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     * @internal
29354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     */
294b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    class LinearMatchNode : public ValueNode {
295b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    public:
296b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        LinearMatchNode(int32_t len, Node *nextNode)
297b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
298b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                  length(len), next(nextNode) {}
299b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual UBool operator==(const Node &other) const;
300b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
301b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    protected:
302b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t length;
303b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        Node *next;
304b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
305b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
30654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    /**
30754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     * @internal
30854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     */
309b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    class BranchNode : public Node {
310b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    public:
311b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        BranchNode(int32_t initialHash) : Node(initialHash) {}
312b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    protected:
313b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t firstEdgeNumber;
314b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
315b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
31654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    /**
31754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     * @internal
31854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     */
319b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    class ListBranchNode : public BranchNode {
320b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    public:
321b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        ListBranchNode() : BranchNode(0x444444), length(0) {}
322b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual UBool operator==(const Node &other) const;
323b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
324b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual void write(StringTrieBuilder &builder);
325b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Adds a unit with a final value.
326b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        void add(int32_t c, int32_t value) {
327b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            units[length]=(UChar)c;
328b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            equal[length]=NULL;
329b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            values[length]=value;
330b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            ++length;
331b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            hash=(hash*37+c)*37+value;
332b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
333b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        // Adds a unit which leads to another match node.
334b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        void add(int32_t c, Node *node) {
335b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            units[length]=(UChar)c;
336b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            equal[length]=node;
337b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            values[length]=0;
338b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            ++length;
339b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho            hash=(hash*37+c)*37+hashCode(node);
340b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        }
341b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    protected:
342b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
343b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t length;
344b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t values[kMaxBranchLinearSubNodeLength];
345b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        UChar units[kMaxBranchLinearSubNodeLength];
346b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
347b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
34854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius    /**
34954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     * @internal
35054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius     */
351b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    class SplitBranchNode : public BranchNode {
352b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    public:
353b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
354b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                : BranchNode(((0x555555*37+middleUnit)*37+
355b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                              hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
356b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
357b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual UBool operator==(const Node &other) const;
358b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
359b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual void write(StringTrieBuilder &builder);
360b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    protected:
361b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        UChar unit;
362b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        Node *lessThan;
363b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        Node *greaterOrEqual;
364b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
365b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
366b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    // Branch head node, for writing the actual node lead unit.
367b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
368b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    class BranchHeadNode : public ValueNode {
369b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    public:
370b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        BranchHeadNode(int32_t len, Node *subNode)
371b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
372b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                  length(len), next(subNode) {}
373b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual UBool operator==(const Node &other) const;
374b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
375b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        virtual void write(StringTrieBuilder &builder);
376b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    protected:
377b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        int32_t length;
378b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho        Node *next;  // A branch sub-node.
379b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    };
380103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#endif  /* U_HIDE_INTERNAL_API */
381b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
382b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
383b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
384b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho                                        Node *nextNode) const = 0;
385b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
386b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
387b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t write(int32_t unit) = 0;
388b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
389b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
390b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
391b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
392b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
393b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
394b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    /** @internal */
395b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
396b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho};
397b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
398b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoU_NAMESPACE_END
399b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
400b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#endif  // __STRINGTRIEBUILDER_H__
401