1/*
2*******************************************************************************
3*   Copyright (C) 2010-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ucharstriebuilder.h
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2010nov14
12*   created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16#include "unicode/ucharstrie.h"
17#include "unicode/ucharstriebuilder.h"
18#include "unicode/unistr.h"
19#include "unicode/ustring.h"
20#include "cmemory.h"
21#include "uarrsort.h"
22#include "uassert.h"
23#include "uhash.h"
24#include "ustr_imp.h"
25
26U_NAMESPACE_BEGIN
27
28/*
29 * Note: This builder implementation stores (string, value) pairs with full copies
30 * of the 16-bit-unit sequences, until the UCharsTrie is built.
31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
32 */
33
34class UCharsTrieElement : public UMemory {
35public:
36    // Use compiler's default constructor, initializes nothing.
37
38    void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
39
40    UnicodeString getString(const UnicodeString &strings) const {
41        int32_t length=strings[stringOffset];
42        return strings.tempSubString(stringOffset+1, length);
43    }
44    int32_t getStringLength(const UnicodeString &strings) const {
45        return strings[stringOffset];
46    }
47
48    UChar charAt(int32_t index, const UnicodeString &strings) const {
49        return strings[stringOffset+1+index];
50    }
51
52    int32_t getValue() const { return value; }
53
54    int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
55
56private:
57    // The first strings unit contains the string length.
58    // (Compared with a stringLength field here, this saves 2 bytes per string.)
59    int32_t stringOffset;
60    int32_t value;
61};
62
63void
64UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
65                         UnicodeString &strings, UErrorCode &errorCode) {
66    if(U_FAILURE(errorCode)) {
67        return;
68    }
69    int32_t length=s.length();
70    if(length>0xffff) {
71        // Too long: We store the length in 1 unit.
72        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
73        return;
74    }
75    stringOffset=strings.length();
76    strings.append((UChar)length);
77    value=val;
78    strings.append(s);
79}
80
81int32_t
82UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
83    return getString(strings).compare(other.getString(strings));
84}
85
86UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
87        : elements(NULL), elementsCapacity(0), elementsLength(0),
88          uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
89
90UCharsTrieBuilder::~UCharsTrieBuilder() {
91    delete[] elements;
92    uprv_free(uchars);
93}
94
95UCharsTrieBuilder &
96UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
97    if(U_FAILURE(errorCode)) {
98        return *this;
99    }
100    if(ucharsLength>0) {
101        // Cannot add elements after building.
102        errorCode=U_NO_WRITE_PERMISSION;
103        return *this;
104    }
105    if(elementsLength==elementsCapacity) {
106        int32_t newCapacity;
107        if(elementsCapacity==0) {
108            newCapacity=1024;
109        } else {
110            newCapacity=4*elementsCapacity;
111        }
112        UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
113        if(newElements==NULL) {
114            errorCode=U_MEMORY_ALLOCATION_ERROR;
115            return *this;
116        }
117        if(elementsLength>0) {
118            uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement));
119        }
120        delete[] elements;
121        elements=newElements;
122        elementsCapacity=newCapacity;
123    }
124    elements[elementsLength++].setTo(s, value, strings, errorCode);
125    if(U_SUCCESS(errorCode) && strings.isBogus()) {
126        errorCode=U_MEMORY_ALLOCATION_ERROR;
127    }
128    return *this;
129}
130
131U_CDECL_BEGIN
132
133static int32_t U_CALLCONV
134compareElementStrings(const void *context, const void *left, const void *right) {
135    const UnicodeString *strings=static_cast<const UnicodeString *>(context);
136    const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
137    const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
138    return leftElement->compareStringTo(*rightElement, *strings);
139}
140
141U_CDECL_END
142
143UCharsTrie *
144UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
145    buildUChars(buildOption, errorCode);
146    UCharsTrie *newTrie=NULL;
147    if(U_SUCCESS(errorCode)) {
148        newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
149        if(newTrie==NULL) {
150            errorCode=U_MEMORY_ALLOCATION_ERROR;
151        } else {
152            uchars=NULL;  // The new trie now owns the array.
153            ucharsCapacity=0;
154        }
155    }
156    return newTrie;
157}
158
159UnicodeString &
160UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
161                                      UErrorCode &errorCode) {
162    buildUChars(buildOption, errorCode);
163    if(U_SUCCESS(errorCode)) {
164        result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
165    }
166    return result;
167}
168
169void
170UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
171    if(U_FAILURE(errorCode)) {
172        return;
173    }
174    if(uchars!=NULL && ucharsLength>0) {
175        // Already built.
176        return;
177    }
178    if(ucharsLength==0) {
179        if(elementsLength==0) {
180            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
181            return;
182        }
183        if(strings.isBogus()) {
184            errorCode=U_MEMORY_ALLOCATION_ERROR;
185            return;
186        }
187        uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
188                      compareElementStrings, &strings,
189                      FALSE,  // need not be a stable sort
190                      &errorCode);
191        if(U_FAILURE(errorCode)) {
192            return;
193        }
194        // Duplicate strings are not allowed.
195        UnicodeString prev=elements[0].getString(strings);
196        for(int32_t i=1; i<elementsLength; ++i) {
197            UnicodeString current=elements[i].getString(strings);
198            if(prev==current) {
199                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
200                return;
201            }
202            prev.fastCopyFrom(current);
203        }
204    }
205    // Create and UChar-serialize the trie for the elements.
206    ucharsLength=0;
207    int32_t capacity=strings.length();
208    if(capacity<1024) {
209        capacity=1024;
210    }
211    if(ucharsCapacity<capacity) {
212        uprv_free(uchars);
213        uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
214        if(uchars==NULL) {
215            errorCode=U_MEMORY_ALLOCATION_ERROR;
216            ucharsCapacity=0;
217            return;
218        }
219        ucharsCapacity=capacity;
220    }
221    StringTrieBuilder::build(buildOption, elementsLength, errorCode);
222    if(uchars==NULL) {
223        errorCode=U_MEMORY_ALLOCATION_ERROR;
224    }
225}
226
227int32_t
228UCharsTrieBuilder::getElementStringLength(int32_t i) const {
229    return elements[i].getStringLength(strings);
230}
231
232UChar
233UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
234    return elements[i].charAt(unitIndex, strings);
235}
236
237int32_t
238UCharsTrieBuilder::getElementValue(int32_t i) const {
239    return elements[i].getValue();
240}
241
242int32_t
243UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
244    const UCharsTrieElement &firstElement=elements[first];
245    const UCharsTrieElement &lastElement=elements[last];
246    int32_t minStringLength=firstElement.getStringLength(strings);
247    while(++unitIndex<minStringLength &&
248            firstElement.charAt(unitIndex, strings)==
249            lastElement.charAt(unitIndex, strings)) {}
250    return unitIndex;
251}
252
253int32_t
254UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
255    int32_t length=0;  // Number of different units at unitIndex.
256    int32_t i=start;
257    do {
258        UChar unit=elements[i++].charAt(unitIndex, strings);
259        while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
260            ++i;
261        }
262        ++length;
263    } while(i<limit);
264    return length;
265}
266
267int32_t
268UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
269    do {
270        UChar unit=elements[i++].charAt(unitIndex, strings);
271        while(unit==elements[i].charAt(unitIndex, strings)) {
272            ++i;
273        }
274    } while(--count>0);
275    return i;
276}
277
278int32_t
279UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
280    while(unit==elements[i].charAt(unitIndex, strings)) {
281        ++i;
282    }
283    return i;
284}
285
286UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
287        : LinearMatchNode(len, nextNode), s(units) {
288    hash=hash*37+ustr_hashUCharsN(units, len);
289}
290
291UBool
292UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
293    if(this==&other) {
294        return TRUE;
295    }
296    if(!LinearMatchNode::operator==(other)) {
297        return FALSE;
298    }
299    const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
300    return 0==u_memcmp(s, o.s, length);
301}
302
303void
304UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
305    UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
306    next->write(builder);
307    b.write(s, length);
308    offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
309}
310
311StringTrieBuilder::Node *
312UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
313                                         Node *nextNode) const {
314    return new UCTLinearMatchNode(
315            elements[i].getString(strings).getBuffer()+unitIndex,
316            length,
317            nextNode);
318}
319
320UBool
321UCharsTrieBuilder::ensureCapacity(int32_t length) {
322    if(uchars==NULL) {
323        return FALSE;  // previous memory allocation had failed
324    }
325    if(length>ucharsCapacity) {
326        int32_t newCapacity=ucharsCapacity;
327        do {
328            newCapacity*=2;
329        } while(newCapacity<=length);
330        UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
331        if(newUChars==NULL) {
332            // unable to allocate memory
333            uprv_free(uchars);
334            uchars=NULL;
335            ucharsCapacity=0;
336            return FALSE;
337        }
338        u_memcpy(newUChars+(newCapacity-ucharsLength),
339                 uchars+(ucharsCapacity-ucharsLength), ucharsLength);
340        uprv_free(uchars);
341        uchars=newUChars;
342        ucharsCapacity=newCapacity;
343    }
344    return TRUE;
345}
346
347int32_t
348UCharsTrieBuilder::write(int32_t unit) {
349    int32_t newLength=ucharsLength+1;
350    if(ensureCapacity(newLength)) {
351        ucharsLength=newLength;
352        uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
353    }
354    return ucharsLength;
355}
356
357int32_t
358UCharsTrieBuilder::write(const UChar *s, int32_t length) {
359    int32_t newLength=ucharsLength+length;
360    if(ensureCapacity(newLength)) {
361        ucharsLength=newLength;
362        u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
363    }
364    return ucharsLength;
365}
366
367int32_t
368UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
369    return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
370}
371
372int32_t
373UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
374    if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
375        return write(i|(isFinal<<15));
376    }
377    UChar intUnits[3];
378    int32_t length;
379    if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
380        intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
381        intUnits[1]=(UChar)((uint32_t)i>>16);
382        intUnits[2]=(UChar)i;
383        length=3;
384    // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
385    //     intUnits[0]=(UChar)(i);
386    //     length=1;
387    } else {
388        intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
389        intUnits[1]=(UChar)i;
390        length=2;
391    }
392    intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
393    return write(intUnits, length);
394}
395
396int32_t
397UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
398    if(!hasValue) {
399        return write(node);
400    }
401    UChar intUnits[3];
402    int32_t length;
403    if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
404        intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
405        intUnits[1]=(UChar)((uint32_t)value>>16);
406        intUnits[2]=(UChar)value;
407        length=3;
408    } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
409        intUnits[0]=(UChar)((value+1)<<6);
410        length=1;
411    } else {
412        intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
413        intUnits[1]=(UChar)value;
414        length=2;
415    }
416    intUnits[0]|=(UChar)node;
417    return write(intUnits, length);
418}
419
420int32_t
421UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
422    int32_t i=ucharsLength-jumpTarget;
423    U_ASSERT(i>=0);
424    if(i<=UCharsTrie::kMaxOneUnitDelta) {
425        return write(i);
426    }
427    UChar intUnits[3];
428    int32_t length;
429    if(i<=UCharsTrie::kMaxTwoUnitDelta) {
430        intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
431        length=1;
432    } else {
433        intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
434        intUnits[1]=(UChar)(i>>16);
435        length=2;
436    }
437    intUnits[length++]=(UChar)i;
438    return write(intUnits, length);
439}
440
441U_NAMESPACE_END
442