1/*
2*******************************************************************************
3*   Copyright (C) 2010-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  bytestriebuilder.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2010sep25
12*   created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16#include "unicode/bytestrie.h"
17#include "unicode/bytestriebuilder.h"
18#include "unicode/stringpiece.h"
19#include "charstr.h"
20#include "cmemory.h"
21#include "uhash.h"
22#include "uarrsort.h"
23#include "uassert.h"
24#include "ustr_imp.h"
25
26U_NAMESPACE_BEGIN
27
28/*
29 * Note: This builder implementation stores (bytes, value) pairs with full copies
30 * of the byte sequences, until the BytesTrie is built.
31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
32 */
33
34class BytesTrieElement : public UMemory {
35public:
36    // Use compiler's default constructor, initializes nothing.
37
38    void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
39
40    StringPiece getString(const CharString &strings) const {
41        int32_t offset=stringOffset;
42        int32_t length;
43        if(offset>=0) {
44            length=(uint8_t)strings[offset++];
45        } else {
46            offset=~offset;
47            length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
48            offset+=2;
49        }
50        return StringPiece(strings.data()+offset, length);
51    }
52    int32_t getStringLength(const CharString &strings) const {
53        int32_t offset=stringOffset;
54        if(offset>=0) {
55            return (uint8_t)strings[offset];
56        } else {
57            offset=~offset;
58            return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
59        }
60    }
61
62    char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
63
64    int32_t getValue() const { return value; }
65
66    int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
67
68private:
69    const char *data(const CharString &strings) const {
70        int32_t offset=stringOffset;
71        if(offset>=0) {
72            ++offset;
73        } else {
74            offset=~offset+2;
75        }
76        return strings.data()+offset;
77    }
78
79    // If the stringOffset is non-negative, then the first strings byte contains
80    // the string length.
81    // If the stringOffset is negative, then the first two strings bytes contain
82    // the string length (big-endian), and the offset needs to be bit-inverted.
83    // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
84    int32_t stringOffset;
85    int32_t value;
86};
87
88void
89BytesTrieElement::setTo(const StringPiece &s, int32_t val,
90                        CharString &strings, UErrorCode &errorCode) {
91    if(U_FAILURE(errorCode)) {
92        return;
93    }
94    int32_t length=s.length();
95    if(length>0xffff) {
96        // Too long: We store the length in 1 or 2 bytes.
97        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
98        return;
99    }
100    int32_t offset=strings.length();
101    if(length>0xff) {
102        offset=~offset;
103        strings.append((char)(length>>8), errorCode);
104    }
105    strings.append((char)length, errorCode);
106    stringOffset=offset;
107    value=val;
108    strings.append(s, errorCode);
109}
110
111int32_t
112BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
113    // TODO: add StringPiece::compare(), see ticket #8187
114    StringPiece thisString=getString(strings);
115    StringPiece otherString=other.getString(strings);
116    int32_t lengthDiff=thisString.length()-otherString.length();
117    int32_t commonLength;
118    if(lengthDiff<=0) {
119        commonLength=thisString.length();
120    } else {
121        commonLength=otherString.length();
122    }
123    int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
124    return diff!=0 ? diff : lengthDiff;
125}
126
127BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
128        : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
129          bytes(NULL), bytesCapacity(0), bytesLength(0) {
130    if(U_FAILURE(errorCode)) {
131        return;
132    }
133    strings=new CharString();
134    if(strings==NULL) {
135        errorCode=U_MEMORY_ALLOCATION_ERROR;
136    }
137}
138
139BytesTrieBuilder::~BytesTrieBuilder() {
140    delete strings;
141    delete[] elements;
142    uprv_free(bytes);
143}
144
145BytesTrieBuilder &
146BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
147    if(U_FAILURE(errorCode)) {
148        return *this;
149    }
150    if(bytesLength>0) {
151        // Cannot add elements after building.
152        errorCode=U_NO_WRITE_PERMISSION;
153        return *this;
154    }
155    if(elementsLength==elementsCapacity) {
156        int32_t newCapacity;
157        if(elementsCapacity==0) {
158            newCapacity=1024;
159        } else {
160            newCapacity=4*elementsCapacity;
161        }
162        BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
163        if(newElements==NULL) {
164            errorCode=U_MEMORY_ALLOCATION_ERROR;
165            return *this; // error instead of dereferencing null
166        }
167        if(elementsLength>0) {
168            uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement));
169        }
170        delete[] elements;
171        elements=newElements;
172        elementsCapacity=newCapacity;
173    }
174    elements[elementsLength++].setTo(s, value, *strings, errorCode);
175    return *this;
176}
177
178U_CDECL_BEGIN
179
180static int32_t U_CALLCONV
181compareElementStrings(const void *context, const void *left, const void *right) {
182    const CharString *strings=static_cast<const CharString *>(context);
183    const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
184    const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
185    return leftElement->compareStringTo(*rightElement, *strings);
186}
187
188U_CDECL_END
189
190BytesTrie *
191BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
192    buildBytes(buildOption, errorCode);
193    BytesTrie *newTrie=NULL;
194    if(U_SUCCESS(errorCode)) {
195        newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
196        if(newTrie==NULL) {
197            errorCode=U_MEMORY_ALLOCATION_ERROR;
198        } else {
199            bytes=NULL;  // The new trie now owns the array.
200            bytesCapacity=0;
201        }
202    }
203    return newTrie;
204}
205
206StringPiece
207BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
208    buildBytes(buildOption, errorCode);
209    StringPiece result;
210    if(U_SUCCESS(errorCode)) {
211        result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
212    }
213    return result;
214}
215
216void
217BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
218    if(U_FAILURE(errorCode)) {
219        return;
220    }
221    if(bytes!=NULL && bytesLength>0) {
222        // Already built.
223        return;
224    }
225    if(bytesLength==0) {
226        if(elementsLength==0) {
227            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
228            return;
229        }
230        uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
231                      compareElementStrings, strings,
232                      FALSE,  // need not be a stable sort
233                      &errorCode);
234        if(U_FAILURE(errorCode)) {
235            return;
236        }
237        // Duplicate strings are not allowed.
238        StringPiece prev=elements[0].getString(*strings);
239        for(int32_t i=1; i<elementsLength; ++i) {
240            StringPiece current=elements[i].getString(*strings);
241            if(prev==current) {
242                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
243                return;
244            }
245            prev=current;
246        }
247    }
248    // Create and byte-serialize the trie for the elements.
249    bytesLength=0;
250    int32_t capacity=strings->length();
251    if(capacity<1024) {
252        capacity=1024;
253    }
254    if(bytesCapacity<capacity) {
255        uprv_free(bytes);
256        bytes=static_cast<char *>(uprv_malloc(capacity));
257        if(bytes==NULL) {
258            errorCode=U_MEMORY_ALLOCATION_ERROR;
259            bytesCapacity=0;
260            return;
261        }
262        bytesCapacity=capacity;
263    }
264    StringTrieBuilder::build(buildOption, elementsLength, errorCode);
265    if(bytes==NULL) {
266        errorCode=U_MEMORY_ALLOCATION_ERROR;
267    }
268}
269
270BytesTrieBuilder &
271BytesTrieBuilder::clear() {
272    strings->clear();
273    elementsLength=0;
274    bytesLength=0;
275    return *this;
276}
277
278int32_t
279BytesTrieBuilder::getElementStringLength(int32_t i) const {
280    return elements[i].getStringLength(*strings);
281}
282
283UChar
284BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
285    return (uint8_t)elements[i].charAt(byteIndex, *strings);
286}
287
288int32_t
289BytesTrieBuilder::getElementValue(int32_t i) const {
290    return elements[i].getValue();
291}
292
293int32_t
294BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
295    const BytesTrieElement &firstElement=elements[first];
296    const BytesTrieElement &lastElement=elements[last];
297    int32_t minStringLength=firstElement.getStringLength(*strings);
298    while(++byteIndex<minStringLength &&
299            firstElement.charAt(byteIndex, *strings)==
300            lastElement.charAt(byteIndex, *strings)) {}
301    return byteIndex;
302}
303
304int32_t
305BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
306    int32_t length=0;  // Number of different bytes at byteIndex.
307    int32_t i=start;
308    do {
309        char byte=elements[i++].charAt(byteIndex, *strings);
310        while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
311            ++i;
312        }
313        ++length;
314    } while(i<limit);
315    return length;
316}
317
318int32_t
319BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
320    do {
321        char byte=elements[i++].charAt(byteIndex, *strings);
322        while(byte==elements[i].charAt(byteIndex, *strings)) {
323            ++i;
324        }
325    } while(--count>0);
326    return i;
327}
328
329int32_t
330BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
331    char b=(char)byte;
332    while(b==elements[i].charAt(byteIndex, *strings)) {
333        ++i;
334    }
335    return i;
336}
337
338BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
339        : LinearMatchNode(len, nextNode), s(bytes) {
340    hash=hash*37+ustr_hashCharsN(bytes, len);
341}
342
343UBool
344BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
345    if(this==&other) {
346        return TRUE;
347    }
348    if(!LinearMatchNode::operator==(other)) {
349        return FALSE;
350    }
351    const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
352    return 0==uprv_memcmp(s, o.s, length);
353}
354
355void
356BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
357    BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
358    next->write(builder);
359    b.write(s, length);
360    offset=b.write(b.getMinLinearMatch()+length-1);
361}
362
363StringTrieBuilder::Node *
364BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
365                                        Node *nextNode) const {
366    return new BTLinearMatchNode(
367            elements[i].getString(*strings).data()+byteIndex,
368            length,
369            nextNode);
370}
371
372UBool
373BytesTrieBuilder::ensureCapacity(int32_t length) {
374    if(bytes==NULL) {
375        return FALSE;  // previous memory allocation had failed
376    }
377    if(length>bytesCapacity) {
378        int32_t newCapacity=bytesCapacity;
379        do {
380            newCapacity*=2;
381        } while(newCapacity<=length);
382        char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
383        if(newBytes==NULL) {
384            // unable to allocate memory
385            uprv_free(bytes);
386            bytes=NULL;
387            bytesCapacity=0;
388            return FALSE;
389        }
390        uprv_memcpy(newBytes+(newCapacity-bytesLength),
391                    bytes+(bytesCapacity-bytesLength), bytesLength);
392        uprv_free(bytes);
393        bytes=newBytes;
394        bytesCapacity=newCapacity;
395    }
396    return TRUE;
397}
398
399int32_t
400BytesTrieBuilder::write(int32_t byte) {
401    int32_t newLength=bytesLength+1;
402    if(ensureCapacity(newLength)) {
403        bytesLength=newLength;
404        bytes[bytesCapacity-bytesLength]=(char)byte;
405    }
406    return bytesLength;
407}
408
409int32_t
410BytesTrieBuilder::write(const char *b, int32_t length) {
411    int32_t newLength=bytesLength+length;
412    if(ensureCapacity(newLength)) {
413        bytesLength=newLength;
414        uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
415    }
416    return bytesLength;
417}
418
419int32_t
420BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
421    return write(elements[i].getString(*strings).data()+byteIndex, length);
422}
423
424int32_t
425BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
426    if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
427        return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
428    }
429    char intBytes[5];
430    int32_t length=1;
431    if(i<0 || i>0xffffff) {
432        intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
433        intBytes[1]=(char)((uint32_t)i>>24);
434        intBytes[2]=(char)((uint32_t)i>>16);
435        intBytes[3]=(char)((uint32_t)i>>8);
436        intBytes[4]=(char)i;
437        length=5;
438    // } else if(i<=BytesTrie::kMaxOneByteValue) {
439    //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
440    } else {
441        if(i<=BytesTrie::kMaxTwoByteValue) {
442            intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
443        } else {
444            if(i<=BytesTrie::kMaxThreeByteValue) {
445                intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
446            } else {
447                intBytes[0]=(char)BytesTrie::kFourByteValueLead;
448                intBytes[1]=(char)(i>>16);
449                length=2;
450            }
451            intBytes[length++]=(char)(i>>8);
452        }
453        intBytes[length++]=(char)i;
454    }
455    intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
456    return write(intBytes, length);
457}
458
459int32_t
460BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
461    int32_t offset=write(node);
462    if(hasValue) {
463        offset=writeValueAndFinal(value, FALSE);
464    }
465    return offset;
466}
467
468int32_t
469BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
470    int32_t i=bytesLength-jumpTarget;
471    U_ASSERT(i>=0);
472    if(i<=BytesTrie::kMaxOneByteDelta) {
473        return write(i);
474    }
475    char intBytes[5];
476    int32_t length;
477    if(i<=BytesTrie::kMaxTwoByteDelta) {
478        intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
479        length=1;
480    } else {
481        if(i<=BytesTrie::kMaxThreeByteDelta) {
482            intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
483            length=2;
484        } else {
485            if(i<=0xffffff) {
486                intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
487                length=3;
488            } else {
489                intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
490                intBytes[1]=(char)(i>>24);
491                length=4;
492            }
493            intBytes[1]=(char)(i>>16);
494        }
495        intBytes[1]=(char)(i>>8);
496    }
497    intBytes[length++]=(char)i;
498    return write(intBytes, length);
499}
500
501U_NAMESPACE_END
502