1/* 2******************************************************************************* 3* Copyright (C) 2010-2012, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* file name: bytestriebuilder.cpp 7* encoding: US-ASCII 8* tab size: 8 (not used) 9* indentation:4 10* 11* created on: 2010sep25 12* created by: Markus W. Scherer 13*/ 14 15#include "unicode/utypes.h" 16#include "unicode/bytestrie.h" 17#include "unicode/bytestriebuilder.h" 18#include "unicode/stringpiece.h" 19#include "charstr.h" 20#include "cmemory.h" 21#include "uhash.h" 22#include "uarrsort.h" 23#include "uassert.h" 24#include "ustr_imp.h" 25 26U_NAMESPACE_BEGIN 27 28/* 29 * Note: This builder implementation stores (bytes, value) pairs with full copies 30 * of the byte sequences, until the BytesTrie is built. 31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie. 32 */ 33 34class BytesTrieElement : public UMemory { 35public: 36 // Use compiler's default constructor, initializes nothing. 37 38 void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode); 39 40 StringPiece getString(const CharString &strings) const { 41 int32_t offset=stringOffset; 42 int32_t length; 43 if(offset>=0) { 44 length=(uint8_t)strings[offset++]; 45 } else { 46 offset=~offset; 47 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 48 offset+=2; 49 } 50 return StringPiece(strings.data()+offset, length); 51 } 52 int32_t getStringLength(const CharString &strings) const { 53 int32_t offset=stringOffset; 54 if(offset>=0) { 55 return (uint8_t)strings[offset]; 56 } else { 57 offset=~offset; 58 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 59 } 60 } 61 62 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; } 63 64 int32_t getValue() const { return value; } 65 66 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const; 67 68private: 69 const char *data(const CharString &strings) const { 70 int32_t offset=stringOffset; 71 if(offset>=0) { 72 ++offset; 73 } else { 74 offset=~offset+2; 75 } 76 return strings.data()+offset; 77 } 78 79 // If the stringOffset is non-negative, then the first strings byte contains 80 // the string length. 81 // If the stringOffset is negative, then the first two strings bytes contain 82 // the string length (big-endian), and the offset needs to be bit-inverted. 83 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.) 84 int32_t stringOffset; 85 int32_t value; 86}; 87 88void 89BytesTrieElement::setTo(const StringPiece &s, int32_t val, 90 CharString &strings, UErrorCode &errorCode) { 91 if(U_FAILURE(errorCode)) { 92 return; 93 } 94 int32_t length=s.length(); 95 if(length>0xffff) { 96 // Too long: We store the length in 1 or 2 bytes. 97 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 98 return; 99 } 100 int32_t offset=strings.length(); 101 if(length>0xff) { 102 offset=~offset; 103 strings.append((char)(length>>8), errorCode); 104 } 105 strings.append((char)length, errorCode); 106 stringOffset=offset; 107 value=val; 108 strings.append(s, errorCode); 109} 110 111int32_t 112BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const { 113 // TODO: add StringPiece::compare(), see ticket #8187 114 StringPiece thisString=getString(strings); 115 StringPiece otherString=other.getString(strings); 116 int32_t lengthDiff=thisString.length()-otherString.length(); 117 int32_t commonLength; 118 if(lengthDiff<=0) { 119 commonLength=thisString.length(); 120 } else { 121 commonLength=otherString.length(); 122 } 123 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength); 124 return diff!=0 ? diff : lengthDiff; 125} 126 127BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode) 128 : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0), 129 bytes(NULL), bytesCapacity(0), bytesLength(0) { 130 if(U_FAILURE(errorCode)) { 131 return; 132 } 133 strings=new CharString(); 134 if(strings==NULL) { 135 errorCode=U_MEMORY_ALLOCATION_ERROR; 136 } 137} 138 139BytesTrieBuilder::~BytesTrieBuilder() { 140 delete strings; 141 delete[] elements; 142 uprv_free(bytes); 143} 144 145BytesTrieBuilder & 146BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) { 147 if(U_FAILURE(errorCode)) { 148 return *this; 149 } 150 if(bytesLength>0) { 151 // Cannot add elements after building. 152 errorCode=U_NO_WRITE_PERMISSION; 153 return *this; 154 } 155 if(elementsLength==elementsCapacity) { 156 int32_t newCapacity; 157 if(elementsCapacity==0) { 158 newCapacity=1024; 159 } else { 160 newCapacity=4*elementsCapacity; 161 } 162 BytesTrieElement *newElements=new BytesTrieElement[newCapacity]; 163 if(newElements==NULL) { 164 errorCode=U_MEMORY_ALLOCATION_ERROR; 165 return *this; // error instead of dereferencing null 166 } 167 if(elementsLength>0) { 168 uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement)); 169 } 170 delete[] elements; 171 elements=newElements; 172 elementsCapacity=newCapacity; 173 } 174 elements[elementsLength++].setTo(s, value, *strings, errorCode); 175 return *this; 176} 177 178U_CDECL_BEGIN 179 180static int32_t U_CALLCONV 181compareElementStrings(const void *context, const void *left, const void *right) { 182 const CharString *strings=static_cast<const CharString *>(context); 183 const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left); 184 const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right); 185 return leftElement->compareStringTo(*rightElement, *strings); 186} 187 188U_CDECL_END 189 190BytesTrie * 191BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 192 buildBytes(buildOption, errorCode); 193 BytesTrie *newTrie=NULL; 194 if(U_SUCCESS(errorCode)) { 195 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength)); 196 if(newTrie==NULL) { 197 errorCode=U_MEMORY_ALLOCATION_ERROR; 198 } else { 199 bytes=NULL; // The new trie now owns the array. 200 bytesCapacity=0; 201 } 202 } 203 return newTrie; 204} 205 206StringPiece 207BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 208 buildBytes(buildOption, errorCode); 209 StringPiece result; 210 if(U_SUCCESS(errorCode)) { 211 result.set(bytes+(bytesCapacity-bytesLength), bytesLength); 212 } 213 return result; 214} 215 216void 217BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 218 if(U_FAILURE(errorCode)) { 219 return; 220 } 221 if(bytes!=NULL && bytesLength>0) { 222 // Already built. 223 return; 224 } 225 if(bytesLength==0) { 226 if(elementsLength==0) { 227 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 228 return; 229 } 230 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement), 231 compareElementStrings, strings, 232 FALSE, // need not be a stable sort 233 &errorCode); 234 if(U_FAILURE(errorCode)) { 235 return; 236 } 237 // Duplicate strings are not allowed. 238 StringPiece prev=elements[0].getString(*strings); 239 for(int32_t i=1; i<elementsLength; ++i) { 240 StringPiece current=elements[i].getString(*strings); 241 if(prev==current) { 242 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 243 return; 244 } 245 prev=current; 246 } 247 } 248 // Create and byte-serialize the trie for the elements. 249 bytesLength=0; 250 int32_t capacity=strings->length(); 251 if(capacity<1024) { 252 capacity=1024; 253 } 254 if(bytesCapacity<capacity) { 255 uprv_free(bytes); 256 bytes=static_cast<char *>(uprv_malloc(capacity)); 257 if(bytes==NULL) { 258 errorCode=U_MEMORY_ALLOCATION_ERROR; 259 bytesCapacity=0; 260 return; 261 } 262 bytesCapacity=capacity; 263 } 264 StringTrieBuilder::build(buildOption, elementsLength, errorCode); 265 if(bytes==NULL) { 266 errorCode=U_MEMORY_ALLOCATION_ERROR; 267 } 268} 269 270BytesTrieBuilder & 271BytesTrieBuilder::clear() { 272 strings->clear(); 273 elementsLength=0; 274 bytesLength=0; 275 return *this; 276} 277 278int32_t 279BytesTrieBuilder::getElementStringLength(int32_t i) const { 280 return elements[i].getStringLength(*strings); 281} 282 283UChar 284BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const { 285 return (uint8_t)elements[i].charAt(byteIndex, *strings); 286} 287 288int32_t 289BytesTrieBuilder::getElementValue(int32_t i) const { 290 return elements[i].getValue(); 291} 292 293int32_t 294BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const { 295 const BytesTrieElement &firstElement=elements[first]; 296 const BytesTrieElement &lastElement=elements[last]; 297 int32_t minStringLength=firstElement.getStringLength(*strings); 298 while(++byteIndex<minStringLength && 299 firstElement.charAt(byteIndex, *strings)== 300 lastElement.charAt(byteIndex, *strings)) {} 301 return byteIndex; 302} 303 304int32_t 305BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const { 306 int32_t length=0; // Number of different bytes at byteIndex. 307 int32_t i=start; 308 do { 309 char byte=elements[i++].charAt(byteIndex, *strings); 310 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) { 311 ++i; 312 } 313 ++length; 314 } while(i<limit); 315 return length; 316} 317 318int32_t 319BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const { 320 do { 321 char byte=elements[i++].charAt(byteIndex, *strings); 322 while(byte==elements[i].charAt(byteIndex, *strings)) { 323 ++i; 324 } 325 } while(--count>0); 326 return i; 327} 328 329int32_t 330BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const { 331 char b=(char)byte; 332 while(b==elements[i].charAt(byteIndex, *strings)) { 333 ++i; 334 } 335 return i; 336} 337 338BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode) 339 : LinearMatchNode(len, nextNode), s(bytes) { 340 hash=hash*37+ustr_hashCharsN(bytes, len); 341} 342 343UBool 344BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const { 345 if(this==&other) { 346 return TRUE; 347 } 348 if(!LinearMatchNode::operator==(other)) { 349 return FALSE; 350 } 351 const BTLinearMatchNode &o=(const BTLinearMatchNode &)other; 352 return 0==uprv_memcmp(s, o.s, length); 353} 354 355void 356BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) { 357 BytesTrieBuilder &b=(BytesTrieBuilder &)builder; 358 next->write(builder); 359 b.write(s, length); 360 offset=b.write(b.getMinLinearMatch()+length-1); 361} 362 363StringTrieBuilder::Node * 364BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length, 365 Node *nextNode) const { 366 return new BTLinearMatchNode( 367 elements[i].getString(*strings).data()+byteIndex, 368 length, 369 nextNode); 370} 371 372UBool 373BytesTrieBuilder::ensureCapacity(int32_t length) { 374 if(bytes==NULL) { 375 return FALSE; // previous memory allocation had failed 376 } 377 if(length>bytesCapacity) { 378 int32_t newCapacity=bytesCapacity; 379 do { 380 newCapacity*=2; 381 } while(newCapacity<=length); 382 char *newBytes=static_cast<char *>(uprv_malloc(newCapacity)); 383 if(newBytes==NULL) { 384 // unable to allocate memory 385 uprv_free(bytes); 386 bytes=NULL; 387 bytesCapacity=0; 388 return FALSE; 389 } 390 uprv_memcpy(newBytes+(newCapacity-bytesLength), 391 bytes+(bytesCapacity-bytesLength), bytesLength); 392 uprv_free(bytes); 393 bytes=newBytes; 394 bytesCapacity=newCapacity; 395 } 396 return TRUE; 397} 398 399int32_t 400BytesTrieBuilder::write(int32_t byte) { 401 int32_t newLength=bytesLength+1; 402 if(ensureCapacity(newLength)) { 403 bytesLength=newLength; 404 bytes[bytesCapacity-bytesLength]=(char)byte; 405 } 406 return bytesLength; 407} 408 409int32_t 410BytesTrieBuilder::write(const char *b, int32_t length) { 411 int32_t newLength=bytesLength+length; 412 if(ensureCapacity(newLength)) { 413 bytesLength=newLength; 414 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length); 415 } 416 return bytesLength; 417} 418 419int32_t 420BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) { 421 return write(elements[i].getString(*strings).data()+byteIndex, length); 422} 423 424int32_t 425BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { 426 if(0<=i && i<=BytesTrie::kMaxOneByteValue) { 427 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal); 428 } 429 char intBytes[5]; 430 int32_t length=1; 431 if(i<0 || i>0xffffff) { 432 intBytes[0]=(char)BytesTrie::kFiveByteValueLead; 433 intBytes[1]=(char)((uint32_t)i>>24); 434 intBytes[2]=(char)((uint32_t)i>>16); 435 intBytes[3]=(char)((uint32_t)i>>8); 436 intBytes[4]=(char)i; 437 length=5; 438 // } else if(i<=BytesTrie::kMaxOneByteValue) { 439 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i); 440 } else { 441 if(i<=BytesTrie::kMaxTwoByteValue) { 442 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8)); 443 } else { 444 if(i<=BytesTrie::kMaxThreeByteValue) { 445 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16)); 446 } else { 447 intBytes[0]=(char)BytesTrie::kFourByteValueLead; 448 intBytes[1]=(char)(i>>16); 449 length=2; 450 } 451 intBytes[length++]=(char)(i>>8); 452 } 453 intBytes[length++]=(char)i; 454 } 455 intBytes[0]=(char)((intBytes[0]<<1)|isFinal); 456 return write(intBytes, length); 457} 458 459int32_t 460BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { 461 int32_t offset=write(node); 462 if(hasValue) { 463 offset=writeValueAndFinal(value, FALSE); 464 } 465 return offset; 466} 467 468int32_t 469BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { 470 int32_t i=bytesLength-jumpTarget; 471 U_ASSERT(i>=0); 472 if(i<=BytesTrie::kMaxOneByteDelta) { 473 return write(i); 474 } 475 char intBytes[5]; 476 int32_t length; 477 if(i<=BytesTrie::kMaxTwoByteDelta) { 478 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); 479 length=1; 480 } else { 481 if(i<=BytesTrie::kMaxThreeByteDelta) { 482 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); 483 length=2; 484 } else { 485 if(i<=0xffffff) { 486 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; 487 length=3; 488 } else { 489 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; 490 intBytes[1]=(char)(i>>24); 491 length=4; 492 } 493 intBytes[1]=(char)(i>>16); 494 } 495 intBytes[1]=(char)(i>>8); 496 } 497 intBytes[length++]=(char)i; 498 return write(intBytes, length); 499} 500 501U_NAMESPACE_END 502