1/* 2******************************************************************************* 3* Copyright (C) 2010-2011, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* file name: ucharstriebuilder.h 7* encoding: US-ASCII 8* tab size: 8 (not used) 9* indentation:4 10* 11* created on: 2010nov14 12* created by: Markus W. Scherer 13*/ 14 15#include "unicode/utypes.h" 16#include "unicode/ucharstrie.h" 17#include "unicode/ucharstriebuilder.h" 18#include "unicode/unistr.h" 19#include "unicode/ustring.h" 20#include "cmemory.h" 21#include "uarrsort.h" 22#include "uassert.h" 23#include "uhash.h" 24 25U_NAMESPACE_BEGIN 26 27/* 28 * Note: This builder implementation stores (string, value) pairs with full copies 29 * of the 16-bit-unit sequences, until the UCharsTrie is built. 30 * It might(!) take less memory if we collected the data in a temporary, dynamic trie. 31 */ 32 33class UCharsTrieElement : public UMemory { 34public: 35 // Use compiler's default constructor, initializes nothing. 36 37 void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode); 38 39 UnicodeString getString(const UnicodeString &strings) const { 40 int32_t length=strings[stringOffset]; 41 return strings.tempSubString(stringOffset+1, length); 42 } 43 int32_t getStringLength(const UnicodeString &strings) const { 44 return strings[stringOffset]; 45 } 46 47 UChar charAt(int32_t index, const UnicodeString &strings) const { 48 return strings[stringOffset+1+index]; 49 } 50 51 int32_t getValue() const { return value; } 52 53 int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const; 54 55private: 56 // The first strings unit contains the string length. 57 // (Compared with a stringLength field here, this saves 2 bytes per string.) 58 int32_t stringOffset; 59 int32_t value; 60}; 61 62void 63UCharsTrieElement::setTo(const UnicodeString &s, int32_t val, 64 UnicodeString &strings, UErrorCode &errorCode) { 65 if(U_FAILURE(errorCode)) { 66 return; 67 } 68 int32_t length=s.length(); 69 if(length>0xffff) { 70 // Too long: We store the length in 1 unit. 71 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 72 return; 73 } 74 stringOffset=strings.length(); 75 strings.append((UChar)length); 76 value=val; 77 strings.append(s); 78} 79 80int32_t 81UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const { 82 return getString(strings).compare(other.getString(strings)); 83} 84 85UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/) 86 : elements(NULL), elementsCapacity(0), elementsLength(0), 87 uchars(NULL), ucharsCapacity(0), ucharsLength(0) {} 88 89UCharsTrieBuilder::~UCharsTrieBuilder() { 90 delete[] elements; 91 uprv_free(uchars); 92} 93 94UCharsTrieBuilder & 95UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) { 96 if(U_FAILURE(errorCode)) { 97 return *this; 98 } 99 if(ucharsLength>0) { 100 // Cannot add elements after building. 101 errorCode=U_NO_WRITE_PERMISSION; 102 return *this; 103 } 104 if(elementsLength==elementsCapacity) { 105 int32_t newCapacity; 106 if(elementsCapacity==0) { 107 newCapacity=1024; 108 } else { 109 newCapacity=4*elementsCapacity; 110 } 111 UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity]; 112 if(newElements==NULL) { 113 errorCode=U_MEMORY_ALLOCATION_ERROR; 114 } 115 if(elementsLength>0) { 116 uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement)); 117 } 118 delete[] elements; 119 elements=newElements; 120 elementsCapacity=newCapacity; 121 } 122 elements[elementsLength++].setTo(s, value, strings, errorCode); 123 if(U_SUCCESS(errorCode) && strings.isBogus()) { 124 errorCode=U_MEMORY_ALLOCATION_ERROR; 125 } 126 return *this; 127} 128 129U_CDECL_BEGIN 130 131static int32_t U_CALLCONV 132compareElementStrings(const void *context, const void *left, const void *right) { 133 const UnicodeString *strings=reinterpret_cast<const UnicodeString *>(context); 134 const UCharsTrieElement *leftElement=reinterpret_cast<const UCharsTrieElement *>(left); 135 const UCharsTrieElement *rightElement=reinterpret_cast<const UCharsTrieElement *>(right); 136 return leftElement->compareStringTo(*rightElement, *strings); 137} 138 139U_CDECL_END 140 141UCharsTrie * 142UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 143 buildUChars(buildOption, errorCode); 144 UCharsTrie *newTrie=NULL; 145 if(U_SUCCESS(errorCode)) { 146 newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength)); 147 if(newTrie==NULL) { 148 errorCode=U_MEMORY_ALLOCATION_ERROR; 149 } else { 150 uchars=NULL; // The new trie now owns the array. 151 ucharsCapacity=0; 152 } 153 } 154 return newTrie; 155} 156 157UnicodeString & 158UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result, 159 UErrorCode &errorCode) { 160 buildUChars(buildOption, errorCode); 161 if(U_SUCCESS(errorCode)) { 162 result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength); 163 } 164 return result; 165} 166 167void 168UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 169 if(U_FAILURE(errorCode)) { 170 return; 171 } 172 if(uchars!=NULL && ucharsLength>0) { 173 // Already built. 174 return; 175 } 176 if(ucharsLength==0) { 177 if(elementsLength==0) { 178 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 179 return; 180 } 181 if(strings.isBogus()) { 182 errorCode=U_MEMORY_ALLOCATION_ERROR; 183 return; 184 } 185 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement), 186 compareElementStrings, &strings, 187 FALSE, // need not be a stable sort 188 &errorCode); 189 if(U_FAILURE(errorCode)) { 190 return; 191 } 192 // Duplicate strings are not allowed. 193 UnicodeString prev=elements[0].getString(strings); 194 for(int32_t i=1; i<elementsLength; ++i) { 195 UnicodeString current=elements[i].getString(strings); 196 if(prev==current) { 197 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 198 return; 199 } 200 prev.fastCopyFrom(current); 201 } 202 } 203 // Create and UChar-serialize the trie for the elements. 204 ucharsLength=0; 205 int32_t capacity=strings.length(); 206 if(capacity<1024) { 207 capacity=1024; 208 } 209 if(ucharsCapacity<capacity) { 210 uprv_free(uchars); 211 uchars=reinterpret_cast<UChar *>(uprv_malloc(capacity*2)); 212 if(uchars==NULL) { 213 errorCode=U_MEMORY_ALLOCATION_ERROR; 214 ucharsCapacity=0; 215 return; 216 } 217 ucharsCapacity=capacity; 218 } 219 StringTrieBuilder::build(buildOption, elementsLength, errorCode); 220 if(uchars==NULL) { 221 errorCode=U_MEMORY_ALLOCATION_ERROR; 222 } 223} 224 225int32_t 226UCharsTrieBuilder::getElementStringLength(int32_t i) const { 227 return elements[i].getStringLength(strings); 228} 229 230UChar 231UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const { 232 return elements[i].charAt(unitIndex, strings); 233} 234 235int32_t 236UCharsTrieBuilder::getElementValue(int32_t i) const { 237 return elements[i].getValue(); 238} 239 240int32_t 241UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const { 242 const UCharsTrieElement &firstElement=elements[first]; 243 const UCharsTrieElement &lastElement=elements[last]; 244 int32_t minStringLength=firstElement.getStringLength(strings); 245 while(++unitIndex<minStringLength && 246 firstElement.charAt(unitIndex, strings)== 247 lastElement.charAt(unitIndex, strings)) {} 248 return unitIndex; 249} 250 251int32_t 252UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const { 253 int32_t length=0; // Number of different units at unitIndex. 254 int32_t i=start; 255 do { 256 UChar unit=elements[i++].charAt(unitIndex, strings); 257 while(i<limit && unit==elements[i].charAt(unitIndex, strings)) { 258 ++i; 259 } 260 ++length; 261 } while(i<limit); 262 return length; 263} 264 265int32_t 266UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const { 267 do { 268 UChar unit=elements[i++].charAt(unitIndex, strings); 269 while(unit==elements[i].charAt(unitIndex, strings)) { 270 ++i; 271 } 272 } while(--count>0); 273 return i; 274} 275 276int32_t 277UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const { 278 while(unit==elements[i].charAt(unitIndex, strings)) { 279 ++i; 280 } 281 return i; 282} 283 284UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode) 285 : LinearMatchNode(len, nextNode), s(units) { 286 hash=hash*37+uhash_hashUCharsN(units, len); 287} 288 289UBool 290UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const { 291 if(this==&other) { 292 return TRUE; 293 } 294 if(!LinearMatchNode::operator==(other)) { 295 return FALSE; 296 } 297 const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other; 298 return 0==u_memcmp(s, o.s, length); 299} 300 301void 302UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) { 303 UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder; 304 next->write(builder); 305 b.write(s, length); 306 offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1); 307} 308 309StringTrieBuilder::Node * 310UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, 311 Node *nextNode) const { 312 return new UCTLinearMatchNode( 313 elements[i].getString(strings).getBuffer()+unitIndex, 314 length, 315 nextNode); 316} 317 318UBool 319UCharsTrieBuilder::ensureCapacity(int32_t length) { 320 if(uchars==NULL) { 321 return FALSE; // previous memory allocation had failed 322 } 323 if(length>ucharsCapacity) { 324 int32_t newCapacity=ucharsCapacity; 325 do { 326 newCapacity*=2; 327 } while(newCapacity<=length); 328 UChar *newUChars=reinterpret_cast<UChar *>(uprv_malloc(newCapacity*2)); 329 if(newUChars==NULL) { 330 // unable to allocate memory 331 uprv_free(uchars); 332 uchars=NULL; 333 ucharsCapacity=0; 334 return FALSE; 335 } 336 u_memcpy(newUChars+(newCapacity-ucharsLength), 337 uchars+(ucharsCapacity-ucharsLength), ucharsLength); 338 uprv_free(uchars); 339 uchars=newUChars; 340 ucharsCapacity=newCapacity; 341 } 342 return TRUE; 343} 344 345int32_t 346UCharsTrieBuilder::write(int32_t unit) { 347 int32_t newLength=ucharsLength+1; 348 if(ensureCapacity(newLength)) { 349 ucharsLength=newLength; 350 uchars[ucharsCapacity-ucharsLength]=(UChar)unit; 351 } 352 return ucharsLength; 353} 354 355int32_t 356UCharsTrieBuilder::write(const UChar *s, int32_t length) { 357 int32_t newLength=ucharsLength+length; 358 if(ensureCapacity(newLength)) { 359 ucharsLength=newLength; 360 u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length); 361 } 362 return ucharsLength; 363} 364 365int32_t 366UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) { 367 return write(elements[i].getString(strings).getBuffer()+unitIndex, length); 368} 369 370int32_t 371UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { 372 if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) { 373 return write(i|(isFinal<<15)); 374 } 375 UChar intUnits[3]; 376 int32_t length; 377 if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) { 378 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead); 379 intUnits[1]=(UChar)(i>>16); 380 intUnits[2]=(UChar)i; 381 length=3; 382 // } else if(i<=UCharsTrie::kMaxOneUnitValue) { 383 // intUnits[0]=(UChar)(i); 384 // length=1; 385 } else { 386 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16)); 387 intUnits[1]=(UChar)i; 388 length=2; 389 } 390 intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15)); 391 return write(intUnits, length); 392} 393 394int32_t 395UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { 396 if(!hasValue) { 397 return write(node); 398 } 399 UChar intUnits[3]; 400 int32_t length; 401 if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) { 402 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead); 403 intUnits[1]=(UChar)(value>>16); 404 intUnits[2]=(UChar)value; 405 length=3; 406 } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) { 407 intUnits[0]=(UChar)((value+1)<<6); 408 length=1; 409 } else { 410 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0)); 411 intUnits[1]=(UChar)value; 412 length=2; 413 } 414 intUnits[0]|=(UChar)node; 415 return write(intUnits, length); 416} 417 418int32_t 419UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) { 420 int32_t i=ucharsLength-jumpTarget; 421 U_ASSERT(i>=0); 422 if(i<=UCharsTrie::kMaxOneUnitDelta) { 423 return write(i); 424 } 425 UChar intUnits[3]; 426 int32_t length; 427 if(i<=UCharsTrie::kMaxTwoUnitDelta) { 428 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16)); 429 length=1; 430 } else { 431 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead); 432 intUnits[1]=(UChar)(i>>16); 433 length=2; 434 } 435 intUnits[length++]=(UChar)i; 436 return write(intUnits, length); 437} 438 439U_NAMESPACE_END 440