1/*
2*******************************************************************************
3*   Copyright (C) 2010-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  bytestriebuilder.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2010sep25
12*   created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16#include "unicode/bytestrie.h"
17#include "unicode/bytestriebuilder.h"
18#include "unicode/stringpiece.h"
19#include "charstr.h"
20#include "cmemory.h"
21#include "uhash.h"
22#include "uarrsort.h"
23#include "uassert.h"
24
25U_NAMESPACE_BEGIN
26
27/*
28 * Note: This builder implementation stores (bytes, value) pairs with full copies
29 * of the byte sequences, until the BytesTrie is built.
30 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
31 */
32
33class BytesTrieElement : public UMemory {
34public:
35    // Use compiler's default constructor, initializes nothing.
36
37    void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
38
39    StringPiece getString(const CharString &strings) const {
40        int32_t offset=stringOffset;
41        int32_t length;
42        if(offset>=0) {
43            length=(uint8_t)strings[offset++];
44        } else {
45            offset=~offset;
46            length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
47            offset+=2;
48        }
49        return StringPiece(strings.data()+offset, length);
50    }
51    int32_t getStringLength(const CharString &strings) const {
52        int32_t offset=stringOffset;
53        if(offset>=0) {
54            return (uint8_t)strings[offset];
55        } else {
56            offset=~offset;
57            return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
58        }
59    }
60
61    char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
62
63    int32_t getValue() const { return value; }
64
65    int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
66
67private:
68    const char *data(const CharString &strings) const {
69        int32_t offset=stringOffset;
70        if(offset>=0) {
71            ++offset;
72        } else {
73            offset=~offset+2;
74        }
75        return strings.data()+offset;
76    }
77
78    // If the stringOffset is non-negative, then the first strings byte contains
79    // the string length.
80    // If the stringOffset is negative, then the first two strings bytes contain
81    // the string length (big-endian), and the offset needs to be bit-inverted.
82    // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
83    int32_t stringOffset;
84    int32_t value;
85};
86
87void
88BytesTrieElement::setTo(const StringPiece &s, int32_t val,
89                        CharString &strings, UErrorCode &errorCode) {
90    if(U_FAILURE(errorCode)) {
91        return;
92    }
93    int32_t length=s.length();
94    if(length>0xffff) {
95        // Too long: We store the length in 1 or 2 bytes.
96        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
97        return;
98    }
99    int32_t offset=strings.length();
100    if(length>0xff) {
101        offset=~offset;
102        strings.append((char)(length>>8), errorCode);
103    }
104    strings.append((char)length, errorCode);
105    stringOffset=offset;
106    value=val;
107    strings.append(s, errorCode);
108}
109
110int32_t
111BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
112    // TODO: add StringPiece::compare(), see ticket #8187
113    StringPiece thisString=getString(strings);
114    StringPiece otherString=other.getString(strings);
115    int32_t lengthDiff=thisString.length()-otherString.length();
116    int32_t commonLength;
117    if(lengthDiff<=0) {
118        commonLength=thisString.length();
119    } else {
120        commonLength=otherString.length();
121    }
122    int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
123    return diff!=0 ? diff : lengthDiff;
124}
125
126BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
127        : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
128          bytes(NULL), bytesCapacity(0), bytesLength(0) {
129    if(U_FAILURE(errorCode)) {
130        return;
131    }
132    strings=new CharString();
133    if(strings==NULL) {
134        errorCode=U_MEMORY_ALLOCATION_ERROR;
135    }
136}
137
138BytesTrieBuilder::~BytesTrieBuilder() {
139    delete strings;
140    delete[] elements;
141    uprv_free(bytes);
142}
143
144BytesTrieBuilder &
145BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
146    if(U_FAILURE(errorCode)) {
147        return *this;
148    }
149    if(bytesLength>0) {
150        // Cannot add elements after building.
151        errorCode=U_NO_WRITE_PERMISSION;
152        return *this;
153    }
154    if(elementsLength==elementsCapacity) {
155        int32_t newCapacity;
156        if(elementsCapacity==0) {
157            newCapacity=1024;
158        } else {
159            newCapacity=4*elementsCapacity;
160        }
161        BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
162        if(newElements==NULL) {
163            errorCode=U_MEMORY_ALLOCATION_ERROR;
164        }
165        if(elementsLength>0) {
166            uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement));
167        }
168        delete[] elements;
169        elements=newElements;
170        elementsCapacity=newCapacity;
171    }
172    elements[elementsLength++].setTo(s, value, *strings, errorCode);
173    return *this;
174}
175
176U_CDECL_BEGIN
177
178static int32_t U_CALLCONV
179compareElementStrings(const void *context, const void *left, const void *right) {
180    const CharString *strings=reinterpret_cast<const CharString *>(context);
181    const BytesTrieElement *leftElement=reinterpret_cast<const BytesTrieElement *>(left);
182    const BytesTrieElement *rightElement=reinterpret_cast<const BytesTrieElement *>(right);
183    return leftElement->compareStringTo(*rightElement, *strings);
184}
185
186U_CDECL_END
187
188BytesTrie *
189BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
190    buildBytes(buildOption, errorCode);
191    BytesTrie *newTrie=NULL;
192    if(U_SUCCESS(errorCode)) {
193        newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
194        if(newTrie==NULL) {
195            errorCode=U_MEMORY_ALLOCATION_ERROR;
196        } else {
197            bytes=NULL;  // The new trie now owns the array.
198            bytesCapacity=0;
199        }
200    }
201    return newTrie;
202}
203
204StringPiece
205BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
206    buildBytes(buildOption, errorCode);
207    StringPiece result;
208    if(U_SUCCESS(errorCode)) {
209        result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
210    }
211    return result;
212}
213
214void
215BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
216    if(U_FAILURE(errorCode)) {
217        return;
218    }
219    if(bytes!=NULL && bytesLength>0) {
220        // Already built.
221        return;
222    }
223    if(bytesLength==0) {
224        if(elementsLength==0) {
225            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
226            return;
227        }
228        uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
229                      compareElementStrings, strings,
230                      FALSE,  // need not be a stable sort
231                      &errorCode);
232        if(U_FAILURE(errorCode)) {
233            return;
234        }
235        // Duplicate strings are not allowed.
236        StringPiece prev=elements[0].getString(*strings);
237        for(int32_t i=1; i<elementsLength; ++i) {
238            StringPiece current=elements[i].getString(*strings);
239            if(prev==current) {
240                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
241                return;
242            }
243            prev=current;
244        }
245    }
246    // Create and byte-serialize the trie for the elements.
247    bytesLength=0;
248    int32_t capacity=strings->length();
249    if(capacity<1024) {
250        capacity=1024;
251    }
252    if(bytesCapacity<capacity) {
253        uprv_free(bytes);
254        bytes=reinterpret_cast<char *>(uprv_malloc(capacity));
255        if(bytes==NULL) {
256            errorCode=U_MEMORY_ALLOCATION_ERROR;
257            bytesCapacity=0;
258            return;
259        }
260        bytesCapacity=capacity;
261    }
262    StringTrieBuilder::build(buildOption, elementsLength, errorCode);
263    if(bytes==NULL) {
264        errorCode=U_MEMORY_ALLOCATION_ERROR;
265    }
266}
267
268BytesTrieBuilder &
269BytesTrieBuilder::clear() {
270    strings->clear();
271    elementsLength=0;
272    bytesLength=0;
273    return *this;
274}
275
276int32_t
277BytesTrieBuilder::getElementStringLength(int32_t i) const {
278    return elements[i].getStringLength(*strings);
279}
280
281UChar
282BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
283    return (uint8_t)elements[i].charAt(byteIndex, *strings);
284}
285
286int32_t
287BytesTrieBuilder::getElementValue(int32_t i) const {
288    return elements[i].getValue();
289}
290
291int32_t
292BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
293    const BytesTrieElement &firstElement=elements[first];
294    const BytesTrieElement &lastElement=elements[last];
295    int32_t minStringLength=firstElement.getStringLength(*strings);
296    while(++byteIndex<minStringLength &&
297            firstElement.charAt(byteIndex, *strings)==
298            lastElement.charAt(byteIndex, *strings)) {}
299    return byteIndex;
300}
301
302int32_t
303BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
304    int32_t length=0;  // Number of different bytes at byteIndex.
305    int32_t i=start;
306    do {
307        char byte=elements[i++].charAt(byteIndex, *strings);
308        while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
309            ++i;
310        }
311        ++length;
312    } while(i<limit);
313    return length;
314}
315
316int32_t
317BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
318    do {
319        char byte=elements[i++].charAt(byteIndex, *strings);
320        while(byte==elements[i].charAt(byteIndex, *strings)) {
321            ++i;
322        }
323    } while(--count>0);
324    return i;
325}
326
327int32_t
328BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
329    char b=(char)byte;
330    while(b==elements[i].charAt(byteIndex, *strings)) {
331        ++i;
332    }
333    return i;
334}
335
336BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
337        : LinearMatchNode(len, nextNode), s(bytes) {
338    hash=hash*37+uhash_hashCharsN(bytes, len);
339}
340
341UBool
342BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
343    if(this==&other) {
344        return TRUE;
345    }
346    if(!LinearMatchNode::operator==(other)) {
347        return FALSE;
348    }
349    const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
350    return 0==uprv_memcmp(s, o.s, length);
351}
352
353void
354BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
355    BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
356    next->write(builder);
357    b.write(s, length);
358    offset=b.write(b.getMinLinearMatch()+length-1);
359}
360
361StringTrieBuilder::Node *
362BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
363                                        Node *nextNode) const {
364    return new BTLinearMatchNode(
365            elements[i].getString(*strings).data()+byteIndex,
366            length,
367            nextNode);
368}
369
370UBool
371BytesTrieBuilder::ensureCapacity(int32_t length) {
372    if(bytes==NULL) {
373        return FALSE;  // previous memory allocation had failed
374    }
375    if(length>bytesCapacity) {
376        int32_t newCapacity=bytesCapacity;
377        do {
378            newCapacity*=2;
379        } while(newCapacity<=length);
380        char *newBytes=reinterpret_cast<char *>(uprv_malloc(newCapacity));
381        if(newBytes==NULL) {
382            // unable to allocate memory
383            uprv_free(bytes);
384            bytes=NULL;
385            bytesCapacity=0;
386            return FALSE;
387        }
388        uprv_memcpy(newBytes+(newCapacity-bytesLength),
389                    bytes+(bytesCapacity-bytesLength), bytesLength);
390        uprv_free(bytes);
391        bytes=newBytes;
392        bytesCapacity=newCapacity;
393    }
394    return TRUE;
395}
396
397int32_t
398BytesTrieBuilder::write(int32_t byte) {
399    int32_t newLength=bytesLength+1;
400    if(ensureCapacity(newLength)) {
401        bytesLength=newLength;
402        bytes[bytesCapacity-bytesLength]=(char)byte;
403    }
404    return bytesLength;
405}
406
407int32_t
408BytesTrieBuilder::write(const char *b, int32_t length) {
409    int32_t newLength=bytesLength+length;
410    if(ensureCapacity(newLength)) {
411        bytesLength=newLength;
412        uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
413    }
414    return bytesLength;
415}
416
417int32_t
418BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
419    return write(elements[i].getString(*strings).data()+byteIndex, length);
420}
421
422int32_t
423BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
424    if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
425        return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
426    }
427    char intBytes[5];
428    int32_t length=1;
429    if(i<0 || i>0xffffff) {
430        intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
431        intBytes[1]=(char)(i>>24);
432        intBytes[2]=(char)(i>>16);
433        intBytes[3]=(char)(i>>8);
434        intBytes[4]=(char)i;
435        length=5;
436    // } else if(i<=BytesTrie::kMaxOneByteValue) {
437    //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
438    } else {
439        if(i<=BytesTrie::kMaxTwoByteValue) {
440            intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
441        } else {
442            if(i<=BytesTrie::kMaxThreeByteValue) {
443                intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
444            } else {
445                intBytes[0]=(char)BytesTrie::kFourByteValueLead;
446                intBytes[1]=(char)(i>>16);
447                length=2;
448            }
449            intBytes[length++]=(char)(i>>8);
450        }
451        intBytes[length++]=(char)i;
452    }
453    intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
454    return write(intBytes, length);
455}
456
457int32_t
458BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
459    int32_t offset=write(node);
460    if(hasValue) {
461        offset=writeValueAndFinal(value, FALSE);
462    }
463    return offset;
464}
465
466int32_t
467BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
468    int32_t i=bytesLength-jumpTarget;
469    U_ASSERT(i>=0);
470    if(i<=BytesTrie::kMaxOneByteDelta) {
471        return write(i);
472    }
473    char intBytes[5];
474    int32_t length;
475    if(i<=BytesTrie::kMaxTwoByteDelta) {
476        intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
477        length=1;
478    } else {
479        if(i<=BytesTrie::kMaxThreeByteDelta) {
480            intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
481            length=2;
482        } else {
483            if(i<=0xffffff) {
484                intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
485                length=3;
486            } else {
487                intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
488                intBytes[1]=(char)(i>>24);
489                length=4;
490            }
491            intBytes[1]=(char)(i>>16);
492        }
493        intBytes[1]=(char)(i>>8);
494    }
495    intBytes[length++]=(char)i;
496    return write(intBytes, length);
497}
498
499U_NAMESPACE_END
500