1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* Copyright (C) 2010-2012, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* file name: stringtriebuilder.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2010dec24 14* created by: Markus W. Scherer 15*/ 16 17#include "utypeinfo.h" // for 'typeid' to work 18#include "unicode/utypes.h" 19#include "unicode/stringtriebuilder.h" 20#include "uassert.h" 21#include "uhash.h" 22 23U_CDECL_BEGIN 24 25static int32_t U_CALLCONV 26hashStringTrieNode(const UHashTok key) { 27 return icu::StringTrieBuilder::hashNode(key.pointer); 28} 29 30static UBool U_CALLCONV 31equalStringTrieNodes(const UHashTok key1, const UHashTok key2) { 32 return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer); 33} 34 35U_CDECL_END 36 37U_NAMESPACE_BEGIN 38 39StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {} 40 41StringTrieBuilder::~StringTrieBuilder() { 42 deleteCompactBuilder(); 43} 44 45void 46StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) { 47 if(U_FAILURE(errorCode)) { 48 return; 49 } 50 nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL, 51 sizeGuess, &errorCode); 52 if(U_SUCCESS(errorCode)) { 53 if(nodes==NULL) { 54 errorCode=U_MEMORY_ALLOCATION_ERROR; 55 } else { 56 uhash_setKeyDeleter(nodes, uprv_deleteUObject); 57 } 58 } 59} 60 61void 62StringTrieBuilder::deleteCompactBuilder() { 63 uhash_close(nodes); 64 nodes=NULL; 65} 66 67void 68StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength, 69 UErrorCode &errorCode) { 70 if(buildOption==USTRINGTRIE_BUILD_FAST) { 71 writeNode(0, elementsLength, 0); 72 } else /* USTRINGTRIE_BUILD_SMALL */ { 73 createCompactBuilder(2*elementsLength, errorCode); 74 Node *root=makeNode(0, elementsLength, 0, errorCode); 75 if(U_SUCCESS(errorCode)) { 76 root->markRightEdgesFirst(-1); 77 root->write(*this); 78 } 79 deleteCompactBuilder(); 80 } 81} 82 83// Requires start<limit, 84// and all strings of the [start..limit[ elements must be sorted and 85// have a common prefix of length unitIndex. 86int32_t 87StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) { 88 UBool hasValue=FALSE; 89 int32_t value=0; 90 int32_t type; 91 if(unitIndex==getElementStringLength(start)) { 92 // An intermediate or final value. 93 value=getElementValue(start++); 94 if(start==limit) { 95 return writeValueAndFinal(value, TRUE); // final-value node 96 } 97 hasValue=TRUE; 98 } 99 // Now all [start..limit[ strings are longer than unitIndex. 100 int32_t minUnit=getElementUnit(start, unitIndex); 101 int32_t maxUnit=getElementUnit(limit-1, unitIndex); 102 if(minUnit==maxUnit) { 103 // Linear-match node: All strings have the same character at unitIndex. 104 int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); 105 writeNode(start, limit, lastUnitIndex); 106 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. 107 int32_t length=lastUnitIndex-unitIndex; 108 int32_t maxLinearMatchLength=getMaxLinearMatchLength(); 109 while(length>maxLinearMatchLength) { 110 lastUnitIndex-=maxLinearMatchLength; 111 length-=maxLinearMatchLength; 112 writeElementUnits(start, lastUnitIndex, maxLinearMatchLength); 113 write(getMinLinearMatch()+maxLinearMatchLength-1); 114 } 115 writeElementUnits(start, unitIndex, length); 116 type=getMinLinearMatch()+length-1; 117 } else { 118 // Branch node. 119 int32_t length=countElementUnits(start, limit, unitIndex); 120 // length>=2 because minUnit!=maxUnit. 121 writeBranchSubNode(start, limit, unitIndex, length); 122 if(--length<getMinLinearMatch()) { 123 type=length; 124 } else { 125 write(length); 126 type=0; 127 } 128 } 129 return writeValueAndType(hasValue, value, type); 130} 131 132// start<limit && all strings longer than unitIndex && 133// length different units at unitIndex 134int32_t 135StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) { 136 UChar middleUnits[kMaxSplitBranchLevels]; 137 int32_t lessThan[kMaxSplitBranchLevels]; 138 int32_t ltLength=0; 139 while(length>getMaxBranchLinearSubNodeLength()) { 140 // Branch on the middle unit. 141 // First, find the middle unit. 142 int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); 143 // Encode the less-than branch first. 144 middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit 145 lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2); 146 ++ltLength; 147 // Continue for the greater-or-equal branch. 148 start=i; 149 length=length-length/2; 150 } 151 // For each unit, find its elements array start and whether it has a final value. 152 int32_t starts[kMaxBranchLinearSubNodeLength]; 153 UBool isFinal[kMaxBranchLinearSubNodeLength-1]; 154 int32_t unitNumber=0; 155 do { 156 int32_t i=starts[unitNumber]=start; 157 UChar unit=getElementUnit(i++, unitIndex); 158 i=indexOfElementWithNextUnit(i, unitIndex, unit); 159 isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start); 160 start=i; 161 } while(++unitNumber<length-1); 162 // unitNumber==length-1, and the maxUnit elements range is [start..limit[ 163 starts[unitNumber]=start; 164 165 // Write the sub-nodes in reverse order: The jump lengths are deltas from 166 // after their own positions, so if we wrote the minUnit sub-node first, 167 // then its jump delta would be larger. 168 // Instead we write the minUnit sub-node last, for a shorter delta. 169 int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1]; 170 do { 171 --unitNumber; 172 if(!isFinal[unitNumber]) { 173 jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1); 174 } 175 } while(unitNumber>0); 176 // The maxUnit sub-node is written as the very last one because we do 177 // not jump for it at all. 178 unitNumber=length-1; 179 writeNode(start, limit, unitIndex+1); 180 int32_t offset=write(getElementUnit(start, unitIndex)); 181 // Write the rest of this node's unit-value pairs. 182 while(--unitNumber>=0) { 183 start=starts[unitNumber]; 184 int32_t value; 185 if(isFinal[unitNumber]) { 186 // Write the final value for the one string ending with this unit. 187 value=getElementValue(start); 188 } else { 189 // Write the delta to the start position of the sub-node. 190 value=offset-jumpTargets[unitNumber]; 191 } 192 writeValueAndFinal(value, isFinal[unitNumber]); 193 offset=write(getElementUnit(start, unitIndex)); 194 } 195 // Write the split-branch nodes. 196 while(ltLength>0) { 197 --ltLength; 198 writeDeltaTo(lessThan[ltLength]); 199 offset=write(middleUnits[ltLength]); 200 } 201 return offset; 202} 203 204// Requires start<limit, 205// and all strings of the [start..limit[ elements must be sorted and 206// have a common prefix of length unitIndex. 207StringTrieBuilder::Node * 208StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) { 209 if(U_FAILURE(errorCode)) { 210 return NULL; 211 } 212 UBool hasValue=FALSE; 213 int32_t value=0; 214 if(unitIndex==getElementStringLength(start)) { 215 // An intermediate or final value. 216 value=getElementValue(start++); 217 if(start==limit) { 218 return registerFinalValue(value, errorCode); 219 } 220 hasValue=TRUE; 221 } 222 Node *node; 223 // Now all [start..limit[ strings are longer than unitIndex. 224 int32_t minUnit=getElementUnit(start, unitIndex); 225 int32_t maxUnit=getElementUnit(limit-1, unitIndex); 226 if(minUnit==maxUnit) { 227 // Linear-match node: All strings have the same character at unitIndex. 228 int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex); 229 Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode); 230 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. 231 int32_t length=lastUnitIndex-unitIndex; 232 int32_t maxLinearMatchLength=getMaxLinearMatchLength(); 233 while(length>maxLinearMatchLength) { 234 lastUnitIndex-=maxLinearMatchLength; 235 length-=maxLinearMatchLength; 236 node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode); 237 nextNode=registerNode(node, errorCode); 238 } 239 node=createLinearMatchNode(start, unitIndex, length, nextNode); 240 } else { 241 // Branch node. 242 int32_t length=countElementUnits(start, limit, unitIndex); 243 // length>=2 because minUnit!=maxUnit. 244 Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode); 245 node=new BranchHeadNode(length, subNode); 246 } 247 if(hasValue && node!=NULL) { 248 if(matchNodesCanHaveValues()) { 249 ((ValueNode *)node)->setValue(value); 250 } else { 251 node=new IntermediateValueNode(value, registerNode(node, errorCode)); 252 } 253 } 254 return registerNode(node, errorCode); 255} 256 257// start<limit && all strings longer than unitIndex && 258// length different units at unitIndex 259StringTrieBuilder::Node * 260StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, 261 int32_t length, UErrorCode &errorCode) { 262 if(U_FAILURE(errorCode)) { 263 return NULL; 264 } 265 UChar middleUnits[kMaxSplitBranchLevels]; 266 Node *lessThan[kMaxSplitBranchLevels]; 267 int32_t ltLength=0; 268 while(length>getMaxBranchLinearSubNodeLength()) { 269 // Branch on the middle unit. 270 // First, find the middle unit. 271 int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2); 272 // Create the less-than branch. 273 middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit 274 lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode); 275 ++ltLength; 276 // Continue for the greater-or-equal branch. 277 start=i; 278 length=length-length/2; 279 } 280 if(U_FAILURE(errorCode)) { 281 return NULL; 282 } 283 ListBranchNode *listNode=new ListBranchNode(); 284 if(listNode==NULL) { 285 errorCode=U_MEMORY_ALLOCATION_ERROR; 286 return NULL; 287 } 288 // For each unit, find its elements array start and whether it has a final value. 289 int32_t unitNumber=0; 290 do { 291 int32_t i=start; 292 UChar unit=getElementUnit(i++, unitIndex); 293 i=indexOfElementWithNextUnit(i, unitIndex, unit); 294 if(start==i-1 && unitIndex+1==getElementStringLength(start)) { 295 listNode->add(unit, getElementValue(start)); 296 } else { 297 listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode)); 298 } 299 start=i; 300 } while(++unitNumber<length-1); 301 // unitNumber==length-1, and the maxUnit elements range is [start..limit[ 302 UChar unit=getElementUnit(start, unitIndex); 303 if(start==limit-1 && unitIndex+1==getElementStringLength(start)) { 304 listNode->add(unit, getElementValue(start)); 305 } else { 306 listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode)); 307 } 308 Node *node=registerNode(listNode, errorCode); 309 // Create the split-branch nodes. 310 while(ltLength>0) { 311 --ltLength; 312 node=registerNode( 313 new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode); 314 } 315 return node; 316} 317 318StringTrieBuilder::Node * 319StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) { 320 if(U_FAILURE(errorCode)) { 321 delete newNode; 322 return NULL; 323 } 324 if(newNode==NULL) { 325 errorCode=U_MEMORY_ALLOCATION_ERROR; 326 return NULL; 327 } 328 const UHashElement *old=uhash_find(nodes, newNode); 329 if(old!=NULL) { 330 delete newNode; 331 return (Node *)old->key.pointer; 332 } 333 // If uhash_puti() returns a non-zero value from an equivalent, previously 334 // registered node, then uhash_find() failed to find that and we will leak newNode. 335#if U_DEBUG 336 int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. 337#endif 338 uhash_puti(nodes, newNode, 1, &errorCode); 339 U_ASSERT(oldValue==0); 340 if(U_FAILURE(errorCode)) { 341 delete newNode; 342 return NULL; 343 } 344 return newNode; 345} 346 347StringTrieBuilder::Node * 348StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) { 349 if(U_FAILURE(errorCode)) { 350 return NULL; 351 } 352 FinalValueNode key(value); 353 const UHashElement *old=uhash_find(nodes, &key); 354 if(old!=NULL) { 355 return (Node *)old->key.pointer; 356 } 357 Node *newNode=new FinalValueNode(value); 358 if(newNode==NULL) { 359 errorCode=U_MEMORY_ALLOCATION_ERROR; 360 return NULL; 361 } 362 // If uhash_puti() returns a non-zero value from an equivalent, previously 363 // registered node, then uhash_find() failed to find that and we will leak newNode. 364#if U_DEBUG 365 int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue. 366#endif 367 uhash_puti(nodes, newNode, 1, &errorCode); 368 U_ASSERT(oldValue==0); 369 if(U_FAILURE(errorCode)) { 370 delete newNode; 371 return NULL; 372 } 373 return newNode; 374} 375 376UBool 377StringTrieBuilder::hashNode(const void *node) { 378 return ((const Node *)node)->hashCode(); 379} 380 381UBool 382StringTrieBuilder::equalNodes(const void *left, const void *right) { 383 return *(const Node *)left==*(const Node *)right; 384} 385 386UBool 387StringTrieBuilder::Node::operator==(const Node &other) const { 388 return this==&other || (typeid(*this)==typeid(other) && hash==other.hash); 389} 390 391int32_t 392StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) { 393 if(offset==0) { 394 offset=edgeNumber; 395 } 396 return edgeNumber; 397} 398 399UBool 400StringTrieBuilder::FinalValueNode::operator==(const Node &other) const { 401 if(this==&other) { 402 return TRUE; 403 } 404 if(!Node::operator==(other)) { 405 return FALSE; 406 } 407 const FinalValueNode &o=(const FinalValueNode &)other; 408 return value==o.value; 409} 410 411void 412StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) { 413 offset=builder.writeValueAndFinal(value, TRUE); 414} 415 416UBool 417StringTrieBuilder::ValueNode::operator==(const Node &other) const { 418 if(this==&other) { 419 return TRUE; 420 } 421 if(!Node::operator==(other)) { 422 return FALSE; 423 } 424 const ValueNode &o=(const ValueNode &)other; 425 return hasValue==o.hasValue && (!hasValue || value==o.value); 426} 427 428UBool 429StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const { 430 if(this==&other) { 431 return TRUE; 432 } 433 if(!ValueNode::operator==(other)) { 434 return FALSE; 435 } 436 const IntermediateValueNode &o=(const IntermediateValueNode &)other; 437 return next==o.next; 438} 439 440int32_t 441StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) { 442 if(offset==0) { 443 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); 444 } 445 return edgeNumber; 446} 447 448void 449StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) { 450 next->write(builder); 451 offset=builder.writeValueAndFinal(value, FALSE); 452} 453 454UBool 455StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const { 456 if(this==&other) { 457 return TRUE; 458 } 459 if(!ValueNode::operator==(other)) { 460 return FALSE; 461 } 462 const LinearMatchNode &o=(const LinearMatchNode &)other; 463 return length==o.length && next==o.next; 464} 465 466int32_t 467StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) { 468 if(offset==0) { 469 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); 470 } 471 return edgeNumber; 472} 473 474UBool 475StringTrieBuilder::ListBranchNode::operator==(const Node &other) const { 476 if(this==&other) { 477 return TRUE; 478 } 479 if(!Node::operator==(other)) { 480 return FALSE; 481 } 482 const ListBranchNode &o=(const ListBranchNode &)other; 483 for(int32_t i=0; i<length; ++i) { 484 if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) { 485 return FALSE; 486 } 487 } 488 return TRUE; 489} 490 491int32_t 492StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) { 493 if(offset==0) { 494 firstEdgeNumber=edgeNumber; 495 int32_t step=0; 496 int32_t i=length; 497 do { 498 Node *edge=equal[--i]; 499 if(edge!=NULL) { 500 edgeNumber=edge->markRightEdgesFirst(edgeNumber-step); 501 } 502 // For all but the rightmost edge, decrement the edge number. 503 step=1; 504 } while(i>0); 505 offset=edgeNumber; 506 } 507 return edgeNumber; 508} 509 510void 511StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) { 512 // Write the sub-nodes in reverse order: The jump lengths are deltas from 513 // after their own positions, so if we wrote the minUnit sub-node first, 514 // then its jump delta would be larger. 515 // Instead we write the minUnit sub-node last, for a shorter delta. 516 int32_t unitNumber=length-1; 517 Node *rightEdge=equal[unitNumber]; 518 int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset(); 519 do { 520 --unitNumber; 521 if(equal[unitNumber]!=NULL) { 522 equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder); 523 } 524 } while(unitNumber>0); 525 // The maxUnit sub-node is written as the very last one because we do 526 // not jump for it at all. 527 unitNumber=length-1; 528 if(rightEdge==NULL) { 529 builder.writeValueAndFinal(values[unitNumber], TRUE); 530 } else { 531 rightEdge->write(builder); 532 } 533 offset=builder.write(units[unitNumber]); 534 // Write the rest of this node's unit-value pairs. 535 while(--unitNumber>=0) { 536 int32_t value; 537 UBool isFinal; 538 if(equal[unitNumber]==NULL) { 539 // Write the final value for the one string ending with this unit. 540 value=values[unitNumber]; 541 isFinal=TRUE; 542 } else { 543 // Write the delta to the start position of the sub-node. 544 U_ASSERT(equal[unitNumber]->getOffset()>0); 545 value=offset-equal[unitNumber]->getOffset(); 546 isFinal=FALSE; 547 } 548 builder.writeValueAndFinal(value, isFinal); 549 offset=builder.write(units[unitNumber]); 550 } 551} 552 553UBool 554StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const { 555 if(this==&other) { 556 return TRUE; 557 } 558 if(!Node::operator==(other)) { 559 return FALSE; 560 } 561 const SplitBranchNode &o=(const SplitBranchNode &)other; 562 return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual; 563} 564 565int32_t 566StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) { 567 if(offset==0) { 568 firstEdgeNumber=edgeNumber; 569 edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber); 570 offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1); 571 } 572 return edgeNumber; 573} 574 575void 576StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) { 577 // Encode the less-than branch first. 578 lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder); 579 // Encode the greater-or-equal branch last because we do not jump for it at all. 580 greaterOrEqual->write(builder); 581 // Write this node. 582 U_ASSERT(lessThan->getOffset()>0); 583 builder.writeDeltaTo(lessThan->getOffset()); // less-than 584 offset=builder.write(unit); 585} 586 587UBool 588StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const { 589 if(this==&other) { 590 return TRUE; 591 } 592 if(!ValueNode::operator==(other)) { 593 return FALSE; 594 } 595 const BranchHeadNode &o=(const BranchHeadNode &)other; 596 return length==o.length && next==o.next; 597} 598 599int32_t 600StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) { 601 if(offset==0) { 602 offset=edgeNumber=next->markRightEdgesFirst(edgeNumber); 603 } 604 return edgeNumber; 605} 606 607void 608StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) { 609 next->write(builder); 610 if(length<=builder.getMinLinearMatch()) { 611 offset=builder.writeValueAndType(hasValue, value, length-1); 612 } else { 613 builder.write(length-1); 614 offset=builder.writeValueAndType(hasValue, value, 0); 615 } 616} 617 618U_NAMESPACE_END 619