1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4****************************************************************************** 5* 6* Copyright (C) 2001-2012, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9****************************************************************************** 10* file name: utrie.cpp 11* encoding: US-ASCII 12* tab size: 8 (not used) 13* indentation:4 14* 15* created on: 2001oct20 16* created by: Markus W. Scherer 17* 18* This is a common implementation of a "folded" trie. 19* It is a kind of compressed, serializable table of 16- or 32-bit values associated with 20* Unicode code points (0..0x10ffff). 21*/ 22 23#ifdef UTRIE_DEBUG 24# include <stdio.h> 25#endif 26 27#include "unicode/utypes.h" 28#include "cmemory.h" 29#include "utrie.h" 30 31/* miscellaneous ------------------------------------------------------------ */ 32 33#undef ABS 34#define ABS(x) ((x)>=0 ? (x) : -(x)) 35 36static inline UBool 37equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { 38 while(length>0 && *s==*t) { 39 ++s; 40 ++t; 41 --length; 42 } 43 return (UBool)(length==0); 44} 45 46/* Building a trie ----------------------------------------------------------*/ 47 48U_CAPI UNewTrie * U_EXPORT2 49utrie_open(UNewTrie *fillIn, 50 uint32_t *aliasData, int32_t maxDataLength, 51 uint32_t initialValue, uint32_t leadUnitValue, 52 UBool latin1Linear) { 53 UNewTrie *trie; 54 int32_t i, j; 55 56 if( maxDataLength<UTRIE_DATA_BLOCK_LENGTH || 57 (latin1Linear && maxDataLength<1024) 58 ) { 59 return NULL; 60 } 61 62 if(fillIn!=NULL) { 63 trie=fillIn; 64 } else { 65 trie=(UNewTrie *)uprv_malloc(sizeof(UNewTrie)); 66 if(trie==NULL) { 67 return NULL; 68 } 69 } 70 uprv_memset(trie, 0, sizeof(UNewTrie)); 71 trie->isAllocated= (UBool)(fillIn==NULL); 72 73 if(aliasData!=NULL) { 74 trie->data=aliasData; 75 trie->isDataAllocated=FALSE; 76 } else { 77 trie->data=(uint32_t *)uprv_malloc(maxDataLength*4); 78 if(trie->data==NULL) { 79 uprv_free(trie); 80 return NULL; 81 } 82 trie->isDataAllocated=TRUE; 83 } 84 85 /* preallocate and reset the first data block (block index 0) */ 86 j=UTRIE_DATA_BLOCK_LENGTH; 87 88 if(latin1Linear) { 89 /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */ 90 /* made sure above that maxDataLength>=1024 */ 91 92 /* set indexes to point to consecutive data blocks */ 93 i=0; 94 do { 95 /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */ 96 trie->index[i++]=j; 97 j+=UTRIE_DATA_BLOCK_LENGTH; 98 } while(i<(256>>UTRIE_SHIFT)); 99 } 100 101 /* reset the initially allocated blocks to the initial value */ 102 trie->dataLength=j; 103 while(j>0) { 104 trie->data[--j]=initialValue; 105 } 106 107 trie->leadUnitValue=leadUnitValue; 108 trie->indexLength=UTRIE_MAX_INDEX_LENGTH; 109 trie->dataCapacity=maxDataLength; 110 trie->isLatin1Linear=latin1Linear; 111 trie->isCompacted=FALSE; 112 return trie; 113} 114 115U_CAPI UNewTrie * U_EXPORT2 116utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) { 117 UNewTrie *trie; 118 UBool isDataAllocated; 119 120 /* do not clone if other is not valid or already compacted */ 121 if(other==NULL || other->data==NULL || other->isCompacted) { 122 return NULL; 123 } 124 125 /* clone data */ 126 if(aliasData!=NULL && aliasDataCapacity>=other->dataCapacity) { 127 isDataAllocated=FALSE; 128 } else { 129 aliasDataCapacity=other->dataCapacity; 130 aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4); 131 if(aliasData==NULL) { 132 return NULL; 133 } 134 isDataAllocated=TRUE; 135 } 136 137 trie=utrie_open(fillIn, aliasData, aliasDataCapacity, 138 other->data[0], other->leadUnitValue, 139 other->isLatin1Linear); 140 if(trie==NULL) { 141 uprv_free(aliasData); 142 } else { 143 uprv_memcpy(trie->index, other->index, sizeof(trie->index)); 144 uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4); 145 trie->dataLength=other->dataLength; 146 trie->isDataAllocated=isDataAllocated; 147 } 148 149 return trie; 150} 151 152U_CAPI void U_EXPORT2 153utrie_close(UNewTrie *trie) { 154 if(trie!=NULL) { 155 if(trie->isDataAllocated) { 156 uprv_free(trie->data); 157 trie->data=NULL; 158 } 159 if(trie->isAllocated) { 160 uprv_free(trie); 161 } 162 } 163} 164 165U_CAPI uint32_t * U_EXPORT2 166utrie_getData(UNewTrie *trie, int32_t *pLength) { 167 if(trie==NULL || pLength==NULL) { 168 return NULL; 169 } 170 171 *pLength=trie->dataLength; 172 return trie->data; 173} 174 175static int32_t 176utrie_allocDataBlock(UNewTrie *trie) { 177 int32_t newBlock, newTop; 178 179 newBlock=trie->dataLength; 180 newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH; 181 if(newTop>trie->dataCapacity) { 182 /* out of memory in the data array */ 183 return -1; 184 } 185 trie->dataLength=newTop; 186 return newBlock; 187} 188 189/** 190 * No error checking for illegal arguments. 191 * 192 * @return -1 if no new data block available (out of memory in data array) 193 * @internal 194 */ 195static int32_t 196utrie_getDataBlock(UNewTrie *trie, UChar32 c) { 197 int32_t indexValue, newBlock; 198 199 c>>=UTRIE_SHIFT; 200 indexValue=trie->index[c]; 201 if(indexValue>0) { 202 return indexValue; 203 } 204 205 /* allocate a new data block */ 206 newBlock=utrie_allocDataBlock(trie); 207 if(newBlock<0) { 208 /* out of memory in the data array */ 209 return -1; 210 } 211 trie->index[c]=newBlock; 212 213 /* copy-on-write for a block from a setRange() */ 214 uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH); 215 return newBlock; 216} 217 218/** 219 * @return TRUE if the value was successfully set 220 */ 221U_CAPI UBool U_EXPORT2 222utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) { 223 int32_t block; 224 225 /* valid, uncompacted trie and valid c? */ 226 if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) { 227 return FALSE; 228 } 229 230 block=utrie_getDataBlock(trie, c); 231 if(block<0) { 232 return FALSE; 233 } 234 235 trie->data[block+(c&UTRIE_MASK)]=value; 236 return TRUE; 237} 238 239U_CAPI uint32_t U_EXPORT2 240utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) { 241 int32_t block; 242 243 /* valid, uncompacted trie and valid c? */ 244 if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) { 245 if(pInBlockZero!=NULL) { 246 *pInBlockZero=TRUE; 247 } 248 return 0; 249 } 250 251 block=trie->index[c>>UTRIE_SHIFT]; 252 if(pInBlockZero!=NULL) { 253 *pInBlockZero= (UBool)(block==0); 254 } 255 256 return trie->data[ABS(block)+(c&UTRIE_MASK)]; 257} 258 259/** 260 * @internal 261 */ 262static void 263utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit, 264 uint32_t value, uint32_t initialValue, UBool overwrite) { 265 uint32_t *pLimit; 266 267 pLimit=block+limit; 268 block+=start; 269 if(overwrite) { 270 while(block<pLimit) { 271 *block++=value; 272 } 273 } else { 274 while(block<pLimit) { 275 if(*block==initialValue) { 276 *block=value; 277 } 278 ++block; 279 } 280 } 281} 282 283U_CAPI UBool U_EXPORT2 284utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite) { 285 /* 286 * repeat value in [start..limit[ 287 * mark index values for repeat-data blocks by setting bit 31 of the index values 288 * fill around existing values if any, if(overwrite) 289 */ 290 uint32_t initialValue; 291 int32_t block, rest, repeatBlock; 292 293 /* valid, uncompacted trie and valid indexes? */ 294 if( trie==NULL || trie->isCompacted || 295 (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit 296 ) { 297 return FALSE; 298 } 299 if(start==limit) { 300 return TRUE; /* nothing to do */ 301 } 302 303 initialValue=trie->data[0]; 304 if(start&UTRIE_MASK) { 305 UChar32 nextStart; 306 307 /* set partial block at [start..following block boundary[ */ 308 block=utrie_getDataBlock(trie, start); 309 if(block<0) { 310 return FALSE; 311 } 312 313 nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK; 314 if(nextStart<=limit) { 315 utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH, 316 value, initialValue, overwrite); 317 start=nextStart; 318 } else { 319 utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK, 320 value, initialValue, overwrite); 321 return TRUE; 322 } 323 } 324 325 /* number of positions in the last, partial block */ 326 rest=limit&UTRIE_MASK; 327 328 /* round down limit to a block boundary */ 329 limit&=~UTRIE_MASK; 330 331 /* iterate over all-value blocks */ 332 if(value==initialValue) { 333 repeatBlock=0; 334 } else { 335 repeatBlock=-1; 336 } 337 while(start<limit) { 338 /* get index value */ 339 block=trie->index[start>>UTRIE_SHIFT]; 340 if(block>0) { 341 /* already allocated, fill in value */ 342 utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite); 343 } else if(trie->data[-block]!=value && (block==0 || overwrite)) { 344 /* set the repeatBlock instead of the current block 0 or range block */ 345 if(repeatBlock>=0) { 346 trie->index[start>>UTRIE_SHIFT]=-repeatBlock; 347 } else { 348 /* create and set and fill the repeatBlock */ 349 repeatBlock=utrie_getDataBlock(trie, start); 350 if(repeatBlock<0) { 351 return FALSE; 352 } 353 354 /* set the negative block number to indicate that it is a repeat block */ 355 trie->index[start>>UTRIE_SHIFT]=-repeatBlock; 356 utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, TRUE); 357 } 358 } 359 360 start+=UTRIE_DATA_BLOCK_LENGTH; 361 } 362 363 if(rest>0) { 364 /* set partial block at [last block boundary..limit[ */ 365 block=utrie_getDataBlock(trie, start); 366 if(block<0) { 367 return FALSE; 368 } 369 370 utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite); 371 } 372 373 return TRUE; 374} 375 376static int32_t 377_findSameIndexBlock(const int32_t *idx, int32_t indexLength, 378 int32_t otherBlock) { 379 int32_t block, i; 380 381 for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) { 382 for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) { 383 if(idx[block+i]!=idx[otherBlock+i]) { 384 break; 385 } 386 } 387 if(i==UTRIE_SURROGATE_BLOCK_COUNT) { 388 return block; 389 } 390 } 391 return indexLength; 392} 393 394/* 395 * Fold the normalization data for supplementary code points into 396 * a compact area on top of the BMP-part of the trie index, 397 * with the lead surrogates indexing this compact area. 398 * 399 * Duplicate the index values for lead surrogates: 400 * From inside the BMP area, where some may be overridden with folded values, 401 * to just after the BMP area, where they can be retrieved for 402 * code point lookups. 403 */ 404static void 405utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) { 406 int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT]; 407 int32_t *idx; 408 uint32_t value; 409 UChar32 c; 410 int32_t indexLength, block; 411#ifdef UTRIE_DEBUG 412 int countLeadCUWithData=0; 413#endif 414 415 idx=trie->index; 416 417 /* copy the lead surrogate indexes into a temporary array */ 418 uprv_memcpy(leadIndexes, idx+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT); 419 420 /* 421 * set all values for lead surrogate code *units* to leadUnitValue 422 * so that, by default, runtime lookups will find no data for associated 423 * supplementary code points, unless there is data for such code points 424 * which will result in a non-zero folding value below that is set for 425 * the respective lead units 426 * 427 * the above saved the indexes for surrogate code *points* 428 * fill the indexes with simplified code from utrie_setRange32() 429 */ 430 if(trie->leadUnitValue==trie->data[0]) { 431 block=0; /* leadUnitValue==initialValue, use all-initial-value block */ 432 } else { 433 /* create and fill the repeatBlock */ 434 block=utrie_allocDataBlock(trie); 435 if(block<0) { 436 /* data table overflow */ 437 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 438 return; 439 } 440 utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE); 441 block=-block; /* negative block number to indicate that it is a repeat block */ 442 } 443 for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) { 444 trie->index[c]=block; 445 } 446 447 /* 448 * Fold significant index values into the area just after the BMP indexes. 449 * In case the first lead surrogate has significant data, 450 * its index block must be used first (in which case the folding is a no-op). 451 * Later all folded index blocks are moved up one to insert the copied 452 * lead surrogate indexes. 453 */ 454 indexLength=UTRIE_BMP_INDEX_LENGTH; 455 456 /* search for any index (stage 1) entries for supplementary code points */ 457 for(c=0x10000; c<0x110000;) { 458 if(idx[c>>UTRIE_SHIFT]!=0) { 459 /* there is data, treat the full block for a lead surrogate */ 460 c&=~0x3ff; 461 462#ifdef UTRIE_DEBUG 463 ++countLeadCUWithData; 464 /* printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10))); */ 465#endif 466 467 /* is there an identical index block? */ 468 block=_findSameIndexBlock(idx, indexLength, c>>UTRIE_SHIFT); 469 470 /* 471 * get a folded value for [c..c+0x400[ and, 472 * if different from the value for the lead surrogate code point, 473 * set it for the lead surrogate code unit 474 */ 475 value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT); 476 if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) { 477 if(!utrie_set32(trie, U16_LEAD(c), value)) { 478 /* data table overflow */ 479 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 480 return; 481 } 482 483 /* if we did not find an identical index block... */ 484 if(block==indexLength) { 485 /* move the actual index (stage 1) entries from the supplementary position to the new one */ 486 uprv_memmove(idx+indexLength, 487 idx+(c>>UTRIE_SHIFT), 488 4*UTRIE_SURROGATE_BLOCK_COUNT); 489 indexLength+=UTRIE_SURROGATE_BLOCK_COUNT; 490 } 491 } 492 c+=0x400; 493 } else { 494 c+=UTRIE_DATA_BLOCK_LENGTH; 495 } 496 } 497#ifdef UTRIE_DEBUG 498 if(countLeadCUWithData>0) { 499 printf("supplementary data for %d lead surrogates\n", countLeadCUWithData); 500 } 501#endif 502 503 /* 504 * index array overflow? 505 * This is to guarantee that a folding offset is of the form 506 * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023. 507 * If the index is too large, then n>=1024 and more than 10 bits are necessary. 508 * 509 * In fact, it can only ever become n==1024 with completely unfoldable data and 510 * the additional block of duplicated values for lead surrogates. 511 */ 512 if(indexLength>=UTRIE_MAX_INDEX_LENGTH) { 513 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 514 return; 515 } 516 517 /* 518 * make space for the lead surrogate index block and 519 * insert it between the BMP indexes and the folded ones 520 */ 521 uprv_memmove(idx+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT, 522 idx+UTRIE_BMP_INDEX_LENGTH, 523 4*(indexLength-UTRIE_BMP_INDEX_LENGTH)); 524 uprv_memcpy(idx+UTRIE_BMP_INDEX_LENGTH, 525 leadIndexes, 526 4*UTRIE_SURROGATE_BLOCK_COUNT); 527 indexLength+=UTRIE_SURROGATE_BLOCK_COUNT; 528 529#ifdef UTRIE_DEBUG 530 printf("trie index count: BMP %ld all Unicode %ld folded %ld\n", 531 UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength); 532#endif 533 534 trie->indexLength=indexLength; 535} 536 537/* 538 * Set a value in the trie index map to indicate which data block 539 * is referenced and which one is not. 540 * utrie_compact() will remove data blocks that are not used at all. 541 * Set 542 * - 0 if it is used 543 * - -1 if it is not used 544 */ 545static void 546_findUnusedBlocks(UNewTrie *trie) { 547 int32_t i; 548 549 /* fill the entire map with "not used" */ 550 uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4); 551 552 /* mark each block that _is_ used with 0 */ 553 for(i=0; i<trie->indexLength; ++i) { 554 trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0; 555 } 556 557 /* never move the all-initial-value block 0 */ 558 trie->map[0]=0; 559} 560 561static int32_t 562_findSameDataBlock(const uint32_t *data, int32_t dataLength, 563 int32_t otherBlock, int32_t step) { 564 int32_t block; 565 566 /* ensure that we do not even partially get past dataLength */ 567 dataLength-=UTRIE_DATA_BLOCK_LENGTH; 568 569 for(block=0; block<=dataLength; block+=step) { 570 if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) { 571 return block; 572 } 573 } 574 return -1; 575} 576 577/* 578 * Compact a folded build-time trie. 579 * 580 * The compaction 581 * - removes blocks that are identical with earlier ones 582 * - overlaps adjacent blocks as much as possible (if overlap==TRUE) 583 * - moves blocks in steps of the data granularity 584 * - moves and overlaps blocks that overlap with multiple values in the overlap region 585 * 586 * It does not 587 * - try to move and overlap blocks that are not already adjacent 588 */ 589static void 590utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) { 591 int32_t i, start, newStart, overlapStart; 592 593 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 594 return; 595 } 596 597 /* valid, uncompacted trie? */ 598 if(trie==NULL) { 599 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 600 return; 601 } 602 if(trie->isCompacted) { 603 return; /* nothing left to do */ 604 } 605 606 /* compaction */ 607 608 /* initialize the index map with "block is used/unused" flags */ 609 _findUnusedBlocks(trie); 610 611 /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */ 612 if(trie->isLatin1Linear && UTRIE_SHIFT<=8) { 613 overlapStart=UTRIE_DATA_BLOCK_LENGTH+256; 614 } else { 615 overlapStart=UTRIE_DATA_BLOCK_LENGTH; 616 } 617 618 newStart=UTRIE_DATA_BLOCK_LENGTH; 619 for(start=newStart; start<trie->dataLength;) { 620 /* 621 * start: index of first entry of current block 622 * newStart: index where the current block is to be moved 623 * (right after current end of already-compacted data) 624 */ 625 626 /* skip blocks that are not used */ 627 if(trie->map[start>>UTRIE_SHIFT]<0) { 628 /* advance start to the next block */ 629 start+=UTRIE_DATA_BLOCK_LENGTH; 630 631 /* leave newStart with the previous block! */ 632 continue; 633 } 634 635 /* search for an identical block */ 636 if( start>=overlapStart && 637 (i=_findSameDataBlock(trie->data, newStart, start, 638 overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH)) 639 >=0 640 ) { 641 /* found an identical block, set the other block's index value for the current block */ 642 trie->map[start>>UTRIE_SHIFT]=i; 643 644 /* advance start to the next block */ 645 start+=UTRIE_DATA_BLOCK_LENGTH; 646 647 /* leave newStart with the previous block! */ 648 continue; 649 } 650 651 /* see if the beginning of this block can be overlapped with the end of the previous block */ 652 if(overlap && start>=overlapStart) { 653 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */ 654 for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY; 655 i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i); 656 i-=UTRIE_DATA_GRANULARITY) {} 657 } else { 658 i=0; 659 } 660 661 if(i>0) { 662 /* some overlap */ 663 trie->map[start>>UTRIE_SHIFT]=newStart-i; 664 665 /* move the non-overlapping indexes to their new positions */ 666 start+=i; 667 for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) { 668 trie->data[newStart++]=trie->data[start++]; 669 } 670 } else if(newStart<start) { 671 /* no overlap, just move the indexes to their new positions */ 672 trie->map[start>>UTRIE_SHIFT]=newStart; 673 for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) { 674 trie->data[newStart++]=trie->data[start++]; 675 } 676 } else /* no overlap && newStart==start */ { 677 trie->map[start>>UTRIE_SHIFT]=start; 678 newStart+=UTRIE_DATA_BLOCK_LENGTH; 679 start=newStart; 680 } 681 } 682 683 /* now adjust the index (stage 1) table */ 684 for(i=0; i<trie->indexLength; ++i) { 685 trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]; 686 } 687 688#ifdef UTRIE_DEBUG 689 /* we saved some space */ 690 printf("compacting trie: count of 32-bit words %lu->%lu\n", 691 (long)trie->dataLength, (long)newStart); 692#endif 693 694 trie->dataLength=newStart; 695} 696 697/* serialization ------------------------------------------------------------ */ 698 699/* 700 * Default function for the folding value: 701 * Just store the offset (16 bits) if there is any non-initial-value entry. 702 * 703 * The offset parameter is never 0. 704 * Returning the offset itself is safe for UTRIE_SHIFT>=5 because 705 * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800 706 * which fits into 16-bit trie values; 707 * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases. 708 * 709 * Theoretically, it would be safer for all possible UTRIE_SHIFT including 710 * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS 711 * which would always result in a value of 0x40..0x43f 712 * (start/end 1k blocks of supplementary Unicode code points). 713 * However, this would be uglier, and would not work for some existing 714 * binary data file formats. 715 * 716 * Also, we do not plan to change UTRIE_SHIFT because it would change binary 717 * data file formats, and we would probably not make it smaller because of 718 * the then even larger BMP index length even for empty tries. 719 */ 720static uint32_t U_CALLCONV 721defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { 722 uint32_t value, initialValue; 723 UChar32 limit; 724 UBool inBlockZero; 725 726 initialValue=trie->data[0]; 727 limit=start+0x400; 728 while(start<limit) { 729 value=utrie_get32(trie, start, &inBlockZero); 730 if(inBlockZero) { 731 start+=UTRIE_DATA_BLOCK_LENGTH; 732 } else if(value!=initialValue) { 733 return (uint32_t)offset; 734 } else { 735 ++start; 736 } 737 } 738 return 0; 739} 740 741U_CAPI int32_t U_EXPORT2 742utrie_serialize(UNewTrie *trie, void *dt, int32_t capacity, 743 UNewTrieGetFoldedValue *getFoldedValue, 744 UBool reduceTo16Bits, 745 UErrorCode *pErrorCode) { 746 UTrieHeader *header; 747 uint32_t *p; 748 uint16_t *dest16; 749 int32_t i, length; 750 uint8_t* data = NULL; 751 752 /* argument check */ 753 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 754 return 0; 755 } 756 757 if(trie==NULL || capacity<0 || (capacity>0 && dt==NULL)) { 758 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 759 return 0; 760 } 761 if(getFoldedValue==NULL) { 762 getFoldedValue=defaultGetFoldedValue; 763 } 764 765 data = (uint8_t*)dt; 766 /* fold and compact if necessary, also checks that indexLength is within limits */ 767 if(!trie->isCompacted) { 768 /* compact once without overlap to improve folding */ 769 utrie_compact(trie, FALSE, pErrorCode); 770 771 /* fold the supplementary part of the index array */ 772 utrie_fold(trie, getFoldedValue, pErrorCode); 773 774 /* compact again with overlap for minimum data array length */ 775 utrie_compact(trie, TRUE, pErrorCode); 776 777 trie->isCompacted=TRUE; 778 if(U_FAILURE(*pErrorCode)) { 779 return 0; 780 } 781 } 782 783 /* is dataLength within limits? */ 784 if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) { 785 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 786 } 787 788 length=sizeof(UTrieHeader)+2*trie->indexLength; 789 if(reduceTo16Bits) { 790 length+=2*trie->dataLength; 791 } else { 792 length+=4*trie->dataLength; 793 } 794 795 if(length>capacity) { 796 return length; /* preflighting */ 797 } 798 799#ifdef UTRIE_DEBUG 800 printf("**UTrieLengths(serialize)** index:%6ld data:%6ld serialized:%6ld\n", 801 (long)trie->indexLength, (long)trie->dataLength, (long)length); 802#endif 803 804 /* set the header fields */ 805 header=(UTrieHeader *)data; 806 data+=sizeof(UTrieHeader); 807 808 header->signature=0x54726965; /* "Trie" */ 809 header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<<UTRIE_OPTIONS_INDEX_SHIFT); 810 811 if(!reduceTo16Bits) { 812 header->options|=UTRIE_OPTIONS_DATA_IS_32_BIT; 813 } 814 if(trie->isLatin1Linear) { 815 header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR; 816 } 817 818 header->indexLength=trie->indexLength; 819 header->dataLength=trie->dataLength; 820 821 /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */ 822 if(reduceTo16Bits) { 823 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */ 824 p=(uint32_t *)trie->index; 825 dest16=(uint16_t *)data; 826 for(i=trie->indexLength; i>0; --i) { 827 *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT); 828 } 829 830 /* write 16-bit data values */ 831 p=trie->data; 832 for(i=trie->dataLength; i>0; --i) { 833 *dest16++=(uint16_t)*p++; 834 } 835 } else { 836 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */ 837 p=(uint32_t *)trie->index; 838 dest16=(uint16_t *)data; 839 for(i=trie->indexLength; i>0; --i) { 840 *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT); 841 } 842 843 /* write 32-bit data values */ 844 uprv_memcpy(dest16, trie->data, 4*(size_t)trie->dataLength); 845 } 846 847 return length; 848} 849 850/* inverse to defaultGetFoldedValue() */ 851U_CAPI int32_t U_EXPORT2 852utrie_defaultGetFoldingOffset(uint32_t data) { 853 return (int32_t)data; 854} 855 856U_CAPI int32_t U_EXPORT2 857utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) { 858 const UTrieHeader *header; 859 const uint16_t *p16; 860 uint32_t options; 861 862 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 863 return -1; 864 } 865 866 /* enough data for a trie header? */ 867 if(length<(int32_t)sizeof(UTrieHeader)) { 868 *pErrorCode=U_INVALID_FORMAT_ERROR; 869 return -1; 870 } 871 872 /* check the signature */ 873 header=(const UTrieHeader *)data; 874 if(header->signature!=0x54726965) { 875 *pErrorCode=U_INVALID_FORMAT_ERROR; 876 return -1; 877 } 878 879 /* get the options and check the shift values */ 880 options=header->options; 881 if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT || 882 ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT 883 ) { 884 *pErrorCode=U_INVALID_FORMAT_ERROR; 885 return -1; 886 } 887 trie->isLatin1Linear= (UBool)((options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0); 888 889 /* get the length values */ 890 trie->indexLength=header->indexLength; 891 trie->dataLength=header->dataLength; 892 893 length-=(int32_t)sizeof(UTrieHeader); 894 895 /* enough data for the index? */ 896 if(length<2*trie->indexLength) { 897 *pErrorCode=U_INVALID_FORMAT_ERROR; 898 return -1; 899 } 900 p16=(const uint16_t *)(header+1); 901 trie->index=p16; 902 p16+=trie->indexLength; 903 length-=2*trie->indexLength; 904 905 /* get the data */ 906 if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) { 907 if(length<4*trie->dataLength) { 908 *pErrorCode=U_INVALID_FORMAT_ERROR; 909 return -1; 910 } 911 trie->data32=(const uint32_t *)p16; 912 trie->initialValue=trie->data32[0]; 913 length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength; 914 } else { 915 if(length<2*trie->dataLength) { 916 *pErrorCode=U_INVALID_FORMAT_ERROR; 917 return -1; 918 } 919 920 /* the "data16" data is used via the index pointer */ 921 trie->data32=NULL; 922 trie->initialValue=trie->index[trie->indexLength]; 923 length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength; 924 } 925 926 trie->getFoldingOffset=utrie_defaultGetFoldingOffset; 927 928 return length; 929} 930 931U_CAPI int32_t U_EXPORT2 932utrie_unserializeDummy(UTrie *trie, 933 void *data, int32_t length, 934 uint32_t initialValue, uint32_t leadUnitValue, 935 UBool make16BitTrie, 936 UErrorCode *pErrorCode) { 937 uint16_t *p16; 938 int32_t actualLength, latin1Length, i, limit; 939 uint16_t block; 940 941 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 942 return -1; 943 } 944 945 /* calculate the actual size of the dummy trie data */ 946 947 /* max(Latin-1, block 0) */ 948 latin1Length= 256; /*UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;*/ 949 950 trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT; 951 trie->dataLength=latin1Length; 952 if(leadUnitValue!=initialValue) { 953 trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH; 954 } 955 956 actualLength=trie->indexLength*2; 957 if(make16BitTrie) { 958 actualLength+=trie->dataLength*2; 959 } else { 960 actualLength+=trie->dataLength*4; 961 } 962 963 /* enough space for the dummy trie? */ 964 if(length<actualLength) { 965 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 966 return actualLength; 967 } 968 969 trie->isLatin1Linear=TRUE; 970 trie->initialValue=initialValue; 971 972 /* fill the index and data arrays */ 973 p16=(uint16_t *)data; 974 trie->index=p16; 975 976 if(make16BitTrie) { 977 /* indexes to block 0 */ 978 block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT); 979 limit=trie->indexLength; 980 for(i=0; i<limit; ++i) { 981 p16[i]=block; 982 } 983 984 if(leadUnitValue!=initialValue) { 985 /* indexes for lead surrogate code units to the block after Latin-1 */ 986 block+=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT); 987 i=0xd800>>UTRIE_SHIFT; 988 limit=0xdc00>>UTRIE_SHIFT; 989 for(; i<limit; ++i) { 990 p16[i]=block; 991 } 992 } 993 994 trie->data32=NULL; 995 996 /* Latin-1 data */ 997 p16+=trie->indexLength; 998 for(i=0; i<latin1Length; ++i) { 999 p16[i]=(uint16_t)initialValue; 1000 } 1001 1002 /* data for lead surrogate code units */ 1003 if(leadUnitValue!=initialValue) { 1004 limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH; 1005 for(/* i=latin1Length */; i<limit; ++i) { 1006 p16[i]=(uint16_t)leadUnitValue; 1007 } 1008 } 1009 } else { 1010 uint32_t *p32; 1011 1012 /* indexes to block 0 */ 1013 uprv_memset(p16, 0, trie->indexLength*2); 1014 1015 if(leadUnitValue!=initialValue) { 1016 /* indexes for lead surrogate code units to the block after Latin-1 */ 1017 block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT); 1018 i=0xd800>>UTRIE_SHIFT; 1019 limit=0xdc00>>UTRIE_SHIFT; 1020 for(; i<limit; ++i) { 1021 p16[i]=block; 1022 } 1023 } 1024 1025 trie->data32=p32=(uint32_t *)(p16+trie->indexLength); 1026 1027 /* Latin-1 data */ 1028 for(i=0; i<latin1Length; ++i) { 1029 p32[i]=initialValue; 1030 } 1031 1032 /* data for lead surrogate code units */ 1033 if(leadUnitValue!=initialValue) { 1034 limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH; 1035 for(/* i=latin1Length */; i<limit; ++i) { 1036 p32[i]=leadUnitValue; 1037 } 1038 } 1039 } 1040 1041 trie->getFoldingOffset=utrie_defaultGetFoldingOffset; 1042 1043 return actualLength; 1044} 1045 1046/* enumeration -------------------------------------------------------------- */ 1047 1048/* default UTrieEnumValue() returns the input value itself */ 1049static uint32_t U_CALLCONV 1050enumSameValue(const void * /*context*/, uint32_t value) { 1051 return value; 1052} 1053 1054/** 1055 * Enumerate all ranges of code points with the same relevant values. 1056 * The values are transformed from the raw trie entries by the enumValue function. 1057 */ 1058U_CAPI void U_EXPORT2 1059utrie_enum(const UTrie *trie, 1060 UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) { 1061 const uint32_t *data32; 1062 const uint16_t *idx; 1063 1064 uint32_t value, prevValue, initialValue; 1065 UChar32 c, prev; 1066 int32_t l, i, j, block, prevBlock, nullBlock, offset; 1067 1068 /* check arguments */ 1069 if(trie==NULL || trie->index==NULL || enumRange==NULL) { 1070 return; 1071 } 1072 if(enumValue==NULL) { 1073 enumValue=enumSameValue; 1074 } 1075 1076 idx=trie->index; 1077 data32=trie->data32; 1078 1079 /* get the enumeration value that corresponds to an initial-value trie data entry */ 1080 initialValue=enumValue(context, trie->initialValue); 1081 1082 if(data32==NULL) { 1083 nullBlock=trie->indexLength; 1084 } else { 1085 nullBlock=0; 1086 } 1087 1088 /* set variables for previous range */ 1089 prevBlock=nullBlock; 1090 prev=0; 1091 prevValue=initialValue; 1092 1093 /* enumerate BMP - the main loop enumerates data blocks */ 1094 for(i=0, c=0; c<=0xffff; ++i) { 1095 if(c==0xd800) { 1096 /* skip lead surrogate code _units_, go to lead surr. code _points_ */ 1097 i=UTRIE_BMP_INDEX_LENGTH; 1098 } else if(c==0xdc00) { 1099 /* go back to regular BMP code points */ 1100 i=c>>UTRIE_SHIFT; 1101 } 1102 1103 block=idx[i]<<UTRIE_INDEX_SHIFT; 1104 if(block==prevBlock) { 1105 /* the block is the same as the previous one, and filled with value */ 1106 c+=UTRIE_DATA_BLOCK_LENGTH; 1107 } else if(block==nullBlock) { 1108 /* this is the all-initial-value block */ 1109 if(prevValue!=initialValue) { 1110 if(prev<c) { 1111 if(!enumRange(context, prev, c, prevValue)) { 1112 return; 1113 } 1114 } 1115 prevBlock=nullBlock; 1116 prev=c; 1117 prevValue=initialValue; 1118 } 1119 c+=UTRIE_DATA_BLOCK_LENGTH; 1120 } else { 1121 prevBlock=block; 1122 for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) { 1123 value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]); 1124 if(value!=prevValue) { 1125 if(prev<c) { 1126 if(!enumRange(context, prev, c, prevValue)) { 1127 return; 1128 } 1129 } 1130 if(j>0) { 1131 /* the block is not filled with all the same value */ 1132 prevBlock=-1; 1133 } 1134 prev=c; 1135 prevValue=value; 1136 } 1137 ++c; 1138 } 1139 } 1140 } 1141 1142 /* enumerate supplementary code points */ 1143 for(l=0xd800; l<0xdc00;) { 1144 /* lead surrogate access */ 1145 offset=idx[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT; 1146 if(offset==nullBlock) { 1147 /* no entries for a whole block of lead surrogates */ 1148 if(prevValue!=initialValue) { 1149 if(prev<c) { 1150 if(!enumRange(context, prev, c, prevValue)) { 1151 return; 1152 } 1153 } 1154 prevBlock=nullBlock; 1155 prev=c; 1156 prevValue=initialValue; 1157 } 1158 1159 l+=UTRIE_DATA_BLOCK_LENGTH; 1160 c+=UTRIE_DATA_BLOCK_LENGTH<<10; 1161 continue; 1162 } 1163 1164 value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : idx[offset+(l&UTRIE_MASK)]; 1165 1166 /* enumerate trail surrogates for this lead surrogate */ 1167 offset=trie->getFoldingOffset(value); 1168 if(offset<=0) { 1169 /* no data for this lead surrogate */ 1170 if(prevValue!=initialValue) { 1171 if(prev<c) { 1172 if(!enumRange(context, prev, c, prevValue)) { 1173 return; 1174 } 1175 } 1176 prevBlock=nullBlock; 1177 prev=c; 1178 prevValue=initialValue; 1179 } 1180 1181 /* nothing else to do for the supplementary code points for this lead surrogate */ 1182 c+=0x400; 1183 } else { 1184 /* enumerate code points for this lead surrogate */ 1185 i=offset; 1186 offset+=UTRIE_SURROGATE_BLOCK_COUNT; 1187 do { 1188 /* copy of most of the body of the BMP loop */ 1189 block=idx[i]<<UTRIE_INDEX_SHIFT; 1190 if(block==prevBlock) { 1191 /* the block is the same as the previous one, and filled with value */ 1192 c+=UTRIE_DATA_BLOCK_LENGTH; 1193 } else if(block==nullBlock) { 1194 /* this is the all-initial-value block */ 1195 if(prevValue!=initialValue) { 1196 if(prev<c) { 1197 if(!enumRange(context, prev, c, prevValue)) { 1198 return; 1199 } 1200 } 1201 prevBlock=nullBlock; 1202 prev=c; 1203 prevValue=initialValue; 1204 } 1205 c+=UTRIE_DATA_BLOCK_LENGTH; 1206 } else { 1207 prevBlock=block; 1208 for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) { 1209 value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]); 1210 if(value!=prevValue) { 1211 if(prev<c) { 1212 if(!enumRange(context, prev, c, prevValue)) { 1213 return; 1214 } 1215 } 1216 if(j>0) { 1217 /* the block is not filled with all the same value */ 1218 prevBlock=-1; 1219 } 1220 prev=c; 1221 prevValue=value; 1222 } 1223 ++c; 1224 } 1225 } 1226 } while(++i<offset); 1227 } 1228 1229 ++l; 1230 } 1231 1232 /* deliver last range */ 1233 enumRange(context, prev, c, prevValue); 1234} 1235