1/* 2****************************************************************************** 3* 4* Copyright (C) 2001-2011, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7****************************************************************************** 8* file name: utrie2_builder.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2008sep26 (split off from utrie2.c) 14* created by: Markus W. Scherer 15* 16* This is a common implementation of a Unicode trie. 17* It is a kind of compressed, serializable table of 16- or 32-bit values associated with 18* Unicode code points (0..0x10ffff). 19* This is the second common version of a Unicode trie (hence the name UTrie2). 20* See utrie2.h for a comparison. 21* 22* This file contains only the builder code. 23* See utrie2.c for the runtime and enumeration code. 24*/ 25#ifdef UTRIE2_DEBUG 26# include <stdio.h> 27#endif 28 29#include "unicode/utypes.h" 30#include "cmemory.h" 31#include "utrie2.h" 32#include "utrie2_impl.h" 33 34#include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */ 35 36#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 37 38/* Implementation notes ----------------------------------------------------- */ 39 40/* 41 * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values 42 * have been chosen to minimize trie sizes overall. 43 * Most of the code is flexible enough to work with a range of values, 44 * within certain limits. 45 * 46 * Exception: Support for separate values for lead surrogate code _units_ 47 * vs. code _points_ was added after the constants were fixed, 48 * and has not been tested nor particularly designed for different constant values. 49 * (Especially the utrie2_enum() code that jumps to the special LSCP index-2 50 * part and back.) 51 * 52 * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data 53 * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH 54 * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction 55 * remapping) stops working. 56 * 57 * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate() 58 * assumes that a single index-2 block is used for 0x400 code points 59 * corresponding to one lead surrogate. 60 * 61 * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains 62 * more than one Unicode plane, and the split of the index-2 table into a BMP 63 * part and a supplementary part, with a gap in between, would not work. 64 * 65 * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because 66 * there is data with more than 64k distinct values, 67 * for example for Unihan collation with a separate collation weight per 68 * Han character. 69 */ 70 71/* Building a trie ----------------------------------------------------------*/ 72 73enum { 74 /** The null index-2 block, following the gap in the index-2 table. */ 75 UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH, 76 77 /** The start of allocated index-2 blocks. */ 78 UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH, 79 80 /** 81 * The null data block. 82 * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller, 83 * to work with 6-bit trail bytes from 2-byte UTF-8. 84 */ 85 UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET, 86 87 /** The start of allocated data blocks. */ 88 UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40, 89 90 /** 91 * The start of data blocks for U+0800 and above. 92 * Below, compaction uses a block length of 64 for 2-byte UTF-8. 93 * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH. 94 * Data values for 0x780 code points beyond ASCII. 95 */ 96 UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780 97}; 98 99/* Start with allocation of 16k data entries. */ 100#define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14) 101 102/* Grow about 8x each time. */ 103#define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17) 104 105static int32_t 106allocIndex2Block(UNewTrie2 *trie); 107 108U_CAPI UTrie2 * U_EXPORT2 109utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { 110 UTrie2 *trie; 111 UNewTrie2 *newTrie; 112 uint32_t *data; 113 int32_t i, j; 114 115 if(U_FAILURE(*pErrorCode)) { 116 return NULL; 117 } 118 119 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); 120 newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); 121 data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4); 122 if(trie==NULL || newTrie==NULL || data==NULL) { 123 uprv_free(trie); 124 uprv_free(newTrie); 125 uprv_free(data); 126 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 127 return 0; 128 } 129 130 uprv_memset(trie, 0, sizeof(UTrie2)); 131 trie->initialValue=initialValue; 132 trie->errorValue=errorValue; 133 trie->highStart=0x110000; 134 trie->newTrie=newTrie; 135 136 newTrie->data=data; 137 newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH; 138 newTrie->initialValue=initialValue; 139 newTrie->errorValue=errorValue; 140 newTrie->highStart=0x110000; 141 newTrie->firstFreeBlock=0; /* no free block in the list */ 142 newTrie->isCompacted=FALSE; 143 144 /* 145 * preallocate and reset 146 * - ASCII 147 * - the bad-UTF-8-data block 148 * - the null data block 149 */ 150 for(i=0; i<0x80; ++i) { 151 newTrie->data[i]=initialValue; 152 } 153 for(; i<0xc0; ++i) { 154 newTrie->data[i]=errorValue; 155 } 156 for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) { 157 newTrie->data[i]=initialValue; 158 } 159 newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET; 160 newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET; 161 162 /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */ 163 for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { 164 newTrie->index2[i]=j; 165 newTrie->map[i]=1; 166 } 167 /* reference counts for the bad-UTF-8-data block */ 168 for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { 169 newTrie->map[i]=0; 170 } 171 /* 172 * Reference counts for the null data block: all blocks except for the ASCII blocks. 173 * Plus 1 so that we don't drop this block during compaction. 174 * Plus as many as needed for lead surrogate code points. 175 */ 176 /* i==newTrie->dataNullOffset */ 177 newTrie->map[i++]= 178 (0x110000>>UTRIE2_SHIFT_2)- 179 (0x80>>UTRIE2_SHIFT_2)+ 180 1+ 181 UTRIE2_LSCP_INDEX_2_LENGTH; 182 j+=UTRIE2_DATA_BLOCK_LENGTH; 183 for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { 184 newTrie->map[i]=0; 185 } 186 187 /* 188 * set the remaining indexes in the BMP index-2 block 189 * to the null data block 190 */ 191 for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) { 192 newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET; 193 } 194 195 /* 196 * Fill the index gap with impossible values so that compaction 197 * does not overlap other index-2 blocks with the gap. 198 */ 199 for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) { 200 newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1; 201 } 202 203 /* set the indexes in the null index-2 block */ 204 for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) { 205 newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET; 206 } 207 newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET; 208 newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET; 209 210 /* set the index-1 indexes for the linear index-2 block */ 211 for(i=0, j=0; 212 i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; 213 ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH 214 ) { 215 newTrie->index1[i]=j; 216 } 217 218 /* set the remaining index-1 indexes to the null index-2 block */ 219 for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { 220 newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET; 221 } 222 223 /* 224 * Preallocate and reset data for U+0080..U+07ff, 225 * for 2-byte UTF-8 which will be compacted in 64-blocks 226 * even if UTRIE2_DATA_BLOCK_LENGTH is smaller. 227 */ 228 for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) { 229 utrie2_set32(trie, i, initialValue, pErrorCode); 230 } 231 232 return trie; 233} 234 235static UNewTrie2 * 236cloneBuilder(const UNewTrie2 *other) { 237 UNewTrie2 *trie; 238 239 trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); 240 if(trie==NULL) { 241 return NULL; 242 } 243 244 trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4); 245 if(trie->data==NULL) { 246 uprv_free(trie); 247 return NULL; 248 } 249 trie->dataCapacity=other->dataCapacity; 250 251 /* clone data */ 252 uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1)); 253 uprv_memcpy(trie->index2, other->index2, other->index2Length*4); 254 trie->index2NullOffset=other->index2NullOffset; 255 trie->index2Length=other->index2Length; 256 257 uprv_memcpy(trie->data, other->data, other->dataLength*4); 258 trie->dataNullOffset=other->dataNullOffset; 259 trie->dataLength=other->dataLength; 260 261 /* reference counters */ 262 if(other->isCompacted) { 263 trie->firstFreeBlock=0; 264 } else { 265 uprv_memcpy(trie->map, other->map, (other->dataLength>>UTRIE2_SHIFT_2)*4); 266 trie->firstFreeBlock=other->firstFreeBlock; 267 } 268 269 trie->initialValue=other->initialValue; 270 trie->errorValue=other->errorValue; 271 trie->highStart=other->highStart; 272 trie->isCompacted=other->isCompacted; 273 274 return trie; 275} 276 277U_CAPI UTrie2 * U_EXPORT2 278utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) { 279 UTrie2 *trie; 280 281 if(U_FAILURE(*pErrorCode)) { 282 return NULL; 283 } 284 if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { 285 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 286 return NULL; 287 } 288 289 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); 290 if(trie==NULL) { 291 return NULL; 292 } 293 uprv_memcpy(trie, other, sizeof(UTrie2)); 294 295 if(other->memory!=NULL) { 296 trie->memory=uprv_malloc(other->length); 297 if(trie->memory!=NULL) { 298 trie->isMemoryOwned=TRUE; 299 uprv_memcpy(trie->memory, other->memory, other->length); 300 301 /* make the clone's pointers point to its own memory */ 302 trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory); 303 if(other->data16!=NULL) { 304 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory); 305 } 306 if(other->data32!=NULL) { 307 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory); 308 } 309 } 310 } else /* other->newTrie!=NULL */ { 311 trie->newTrie=cloneBuilder(other->newTrie); 312 } 313 314 if(trie->memory==NULL && trie->newTrie==NULL) { 315 uprv_free(trie); 316 trie=NULL; 317 } 318 return trie; 319} 320 321typedef struct NewTrieAndStatus { 322 UTrie2 *trie; 323 UErrorCode errorCode; 324 UBool exclusiveLimit; /* rather than inclusive range end */ 325} NewTrieAndStatus; 326 327static UBool U_CALLCONV 328copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 329 NewTrieAndStatus *nt=(NewTrieAndStatus *)context; 330 if(value!=nt->trie->initialValue) { 331 if(nt->exclusiveLimit) { 332 --end; 333 } 334 if(start==end) { 335 utrie2_set32(nt->trie, start, value, &nt->errorCode); 336 } else { 337 utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode); 338 } 339 return U_SUCCESS(nt->errorCode); 340 } else { 341 return TRUE; 342 } 343} 344 345#ifdef UTRIE2_DEBUG 346static void 347utrie_printLengths(const UTrie *trie) { 348 long indexLength=trie->indexLength; 349 long dataLength=(long)trie->dataLength; 350 long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); 351 printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n", 352 indexLength, dataLength, totalLength); 353} 354 355static void 356utrie2_printLengths(const UTrie2 *trie, const char *which) { 357 long indexLength=trie->indexLength; 358 long dataLength=(long)trie->dataLength; 359 long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); 360 printf("**UTrie2Lengths(%s)** index:%6ld data:%6ld serialized:%6ld\n", 361 which, indexLength, dataLength, totalLength); 362} 363#endif 364 365U_CAPI UTrie2 * U_EXPORT2 366utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) { 367 NewTrieAndStatus context; 368 UChar lead; 369 370 if(U_FAILURE(*pErrorCode)) { 371 return NULL; 372 } 373 if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { 374 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 375 return NULL; 376 } 377 if(other->newTrie!=NULL && !other->newTrie->isCompacted) { 378 return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */ 379 } 380 381 /* Clone the frozen trie by enumerating it and building a new one. */ 382 context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode); 383 if(U_FAILURE(*pErrorCode)) { 384 return NULL; 385 } 386 context.exclusiveLimit=FALSE; 387 context.errorCode=*pErrorCode; 388 utrie2_enum(other, NULL, copyEnumRange, &context); 389 *pErrorCode=context.errorCode; 390 for(lead=0xd800; lead<0xdc00; ++lead) { 391 uint32_t value; 392 if(other->data32==NULL) { 393 value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead); 394 } else { 395 value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead); 396 } 397 if(value!=other->initialValue) { 398 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); 399 } 400 } 401 if(U_FAILURE(*pErrorCode)) { 402 utrie2_close(context.trie); 403 context.trie=NULL; 404 } 405 return context.trie; 406} 407 408/* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */ 409U_CAPI UTrie2 * U_EXPORT2 410utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) { 411 NewTrieAndStatus context; 412 UChar lead; 413 414 if(U_FAILURE(*pErrorCode)) { 415 return NULL; 416 } 417 if(trie1==NULL) { 418 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 419 return NULL; 420 } 421 context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode); 422 if(U_FAILURE(*pErrorCode)) { 423 return NULL; 424 } 425 context.exclusiveLimit=TRUE; 426 context.errorCode=*pErrorCode; 427 utrie_enum(trie1, NULL, copyEnumRange, &context); 428 *pErrorCode=context.errorCode; 429 for(lead=0xd800; lead<0xdc00; ++lead) { 430 uint32_t value; 431 if(trie1->data32==NULL) { 432 value=UTRIE_GET16_FROM_LEAD(trie1, lead); 433 } else { 434 value=UTRIE_GET32_FROM_LEAD(trie1, lead); 435 } 436 if(value!=trie1->initialValue) { 437 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); 438 } 439 } 440 if(U_SUCCESS(*pErrorCode)) { 441 utrie2_freeze(context.trie, 442 trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS, 443 pErrorCode); 444 } 445#ifdef UTRIE2_DEBUG 446 if(U_SUCCESS(*pErrorCode)) { 447 utrie_printLengths(trie1); 448 utrie2_printLengths(context.trie, "fromUTrie"); 449 } 450#endif 451 if(U_FAILURE(*pErrorCode)) { 452 utrie2_close(context.trie); 453 context.trie=NULL; 454 } 455 return context.trie; 456} 457 458static inline UBool 459isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { 460 int32_t i2, block; 461 462 if(U_IS_LEAD(c) && forLSCP) { 463 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ 464 (c>>UTRIE2_SHIFT_2); 465 } else { 466 i2=trie->index1[c>>UTRIE2_SHIFT_1]+ 467 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); 468 } 469 block=trie->index2[i2]; 470 return (UBool)(block==trie->dataNullOffset); 471} 472 473static int32_t 474allocIndex2Block(UNewTrie2 *trie) { 475 int32_t newBlock, newTop; 476 477 newBlock=trie->index2Length; 478 newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH; 479 if(newTop>LENGTHOF(trie->index2)) { 480 /* 481 * Should never occur. 482 * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect, 483 * or the code writes more values than should be possible. 484 */ 485 return -1; 486 } 487 trie->index2Length=newTop; 488 uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4); 489 return newBlock; 490} 491 492static int32_t 493getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { 494 int32_t i1, i2; 495 496 if(U_IS_LEAD(c) && forLSCP) { 497 return UTRIE2_LSCP_INDEX_2_OFFSET; 498 } 499 500 i1=c>>UTRIE2_SHIFT_1; 501 i2=trie->index1[i1]; 502 if(i2==trie->index2NullOffset) { 503 i2=allocIndex2Block(trie); 504 if(i2<0) { 505 return -1; /* program error */ 506 } 507 trie->index1[i1]=i2; 508 } 509 return i2; 510} 511 512static int32_t 513allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) { 514 int32_t newBlock, newTop; 515 516 if(trie->firstFreeBlock!=0) { 517 /* get the first free block */ 518 newBlock=trie->firstFreeBlock; 519 trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2]; 520 } else { 521 /* get a new block from the high end */ 522 newBlock=trie->dataLength; 523 newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH; 524 if(newTop>trie->dataCapacity) { 525 /* out of memory in the data array */ 526 int32_t capacity; 527 uint32_t *data; 528 529 if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) { 530 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH; 531 } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) { 532 capacity=UNEWTRIE2_MAX_DATA_LENGTH; 533 } else { 534 /* 535 * Should never occur. 536 * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect, 537 * or the code writes more values than should be possible. 538 */ 539 return -1; 540 } 541 data=(uint32_t *)uprv_malloc(capacity*4); 542 if(data==NULL) { 543 return -1; 544 } 545 uprv_memcpy(data, trie->data, trie->dataLength*4); 546 uprv_free(trie->data); 547 trie->data=data; 548 trie->dataCapacity=capacity; 549 } 550 trie->dataLength=newTop; 551 } 552 uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4); 553 trie->map[newBlock>>UTRIE2_SHIFT_2]=0; 554 return newBlock; 555} 556 557/* call when the block's reference counter reaches 0 */ 558static void 559releaseDataBlock(UNewTrie2 *trie, int32_t block) { 560 /* put this block at the front of the free-block chain */ 561 trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock; 562 trie->firstFreeBlock=block; 563} 564 565static inline UBool 566isWritableBlock(UNewTrie2 *trie, int32_t block) { 567 return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]); 568} 569 570static inline void 571setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) { 572 int32_t oldBlock; 573 ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */ 574 oldBlock=trie->index2[i2]; 575 if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) { 576 releaseDataBlock(trie, oldBlock); 577 } 578 trie->index2[i2]=block; 579} 580 581/** 582 * No error checking for illegal arguments. 583 * 584 * @return -1 if no new data block available (out of memory in data array) 585 * @internal 586 */ 587static int32_t 588getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { 589 int32_t i2, oldBlock, newBlock; 590 591 i2=getIndex2Block(trie, c, forLSCP); 592 if(i2<0) { 593 return -1; /* program error */ 594 } 595 596 i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; 597 oldBlock=trie->index2[i2]; 598 if(isWritableBlock(trie, oldBlock)) { 599 return oldBlock; 600 } 601 602 /* allocate a new data block */ 603 newBlock=allocDataBlock(trie, oldBlock); 604 if(newBlock<0) { 605 /* out of memory in the data array */ 606 return -1; 607 } 608 setIndex2Entry(trie, i2, newBlock); 609 return newBlock; 610} 611 612/** 613 * @return TRUE if the value was successfully set 614 */ 615static void 616set32(UNewTrie2 *trie, 617 UChar32 c, UBool forLSCP, uint32_t value, 618 UErrorCode *pErrorCode) { 619 int32_t block; 620 621 if(trie==NULL || trie->isCompacted) { 622 *pErrorCode=U_NO_WRITE_PERMISSION; 623 return; 624 } 625 626 block=getDataBlock(trie, c, forLSCP); 627 if(block<0) { 628 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 629 return; 630 } 631 632 trie->data[block+(c&UTRIE2_DATA_MASK)]=value; 633} 634 635U_CAPI void U_EXPORT2 636utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { 637 if(U_FAILURE(*pErrorCode)) { 638 return; 639 } 640 if((uint32_t)c>0x10ffff) { 641 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 642 return; 643 } 644 set32(trie->newTrie, c, TRUE, value, pErrorCode); 645} 646 647U_CAPI void U_EXPORT2 648utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie, 649 UChar32 c, uint32_t value, 650 UErrorCode *pErrorCode) { 651 if(U_FAILURE(*pErrorCode)) { 652 return; 653 } 654 if(!U_IS_LEAD(c)) { 655 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 656 return; 657 } 658 set32(trie->newTrie, c, FALSE, value, pErrorCode); 659} 660 661static void 662writeBlock(uint32_t *block, uint32_t value) { 663 uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH; 664 while(block<limit) { 665 *block++=value; 666 } 667} 668 669/** 670 * initialValue is ignored if overwrite=TRUE 671 * @internal 672 */ 673static void 674fillBlock(uint32_t *block, UChar32 start, UChar32 limit, 675 uint32_t value, uint32_t initialValue, UBool overwrite) { 676 uint32_t *pLimit; 677 678 pLimit=block+limit; 679 block+=start; 680 if(overwrite) { 681 while(block<pLimit) { 682 *block++=value; 683 } 684 } else { 685 while(block<pLimit) { 686 if(*block==initialValue) { 687 *block=value; 688 } 689 ++block; 690 } 691 } 692} 693 694U_CAPI void U_EXPORT2 695utrie2_setRange32(UTrie2 *trie, 696 UChar32 start, UChar32 end, 697 uint32_t value, UBool overwrite, 698 UErrorCode *pErrorCode) { 699 /* 700 * repeat value in [start..end] 701 * mark index values for repeat-data blocks by setting bit 31 of the index values 702 * fill around existing values if any, if(overwrite) 703 */ 704 UNewTrie2 *newTrie; 705 int32_t block, rest, repeatBlock; 706 UChar32 limit; 707 708 if(U_FAILURE(*pErrorCode)) { 709 return; 710 } 711 if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) { 712 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 713 return; 714 } 715 newTrie=trie->newTrie; 716 if(newTrie==NULL || newTrie->isCompacted) { 717 *pErrorCode=U_NO_WRITE_PERMISSION; 718 return; 719 } 720 if(!overwrite && value==newTrie->initialValue) { 721 return; /* nothing to do */ 722 } 723 724 limit=end+1; 725 if(start&UTRIE2_DATA_MASK) { 726 UChar32 nextStart; 727 728 /* set partial block at [start..following block boundary[ */ 729 block=getDataBlock(newTrie, start, TRUE); 730 if(block<0) { 731 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 732 return; 733 } 734 735 nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK; 736 if(nextStart<=limit) { 737 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH, 738 value, newTrie->initialValue, overwrite); 739 start=nextStart; 740 } else { 741 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK, 742 value, newTrie->initialValue, overwrite); 743 return; 744 } 745 } 746 747 /* number of positions in the last, partial block */ 748 rest=limit&UTRIE2_DATA_MASK; 749 750 /* round down limit to a block boundary */ 751 limit&=~UTRIE2_DATA_MASK; 752 753 /* iterate over all-value blocks */ 754 if(value==newTrie->initialValue) { 755 repeatBlock=newTrie->dataNullOffset; 756 } else { 757 repeatBlock=-1; 758 } 759 760 while(start<limit) { 761 int32_t i2; 762 UBool setRepeatBlock=FALSE; 763 764 if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) { 765 start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */ 766 continue; 767 } 768 769 /* get index value */ 770 i2=getIndex2Block(newTrie, start, TRUE); 771 if(i2<0) { 772 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 773 return; 774 } 775 i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; 776 block=newTrie->index2[i2]; 777 if(isWritableBlock(newTrie, block)) { 778 /* already allocated */ 779 if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) { 780 /* 781 * We overwrite all values, and it's not a 782 * protected (ASCII-linear or 2-byte UTF-8) block: 783 * replace with the repeatBlock. 784 */ 785 setRepeatBlock=TRUE; 786 } else { 787 /* !overwrite, or protected block: just write the values into this block */ 788 fillBlock(newTrie->data+block, 789 0, UTRIE2_DATA_BLOCK_LENGTH, 790 value, newTrie->initialValue, overwrite); 791 } 792 } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) { 793 /* 794 * Set the repeatBlock instead of the null block or previous repeat block: 795 * 796 * If !isWritableBlock() then all entries in the block have the same value 797 * because it's the null block or a range block (the repeatBlock from a previous 798 * call to utrie2_setRange32()). 799 * No other blocks are used multiple times before compacting. 800 * 801 * The null block is the only non-writable block with the initialValue because 802 * of the repeatBlock initialization above. (If value==initialValue, then 803 * the repeatBlock will be the null data block.) 804 * 805 * We set our repeatBlock if the desired value differs from the block's value, 806 * and if we overwrite any data or if the data is all initial values 807 * (which is the same as the block being the null block, see above). 808 */ 809 setRepeatBlock=TRUE; 810 } 811 if(setRepeatBlock) { 812 if(repeatBlock>=0) { 813 setIndex2Entry(newTrie, i2, repeatBlock); 814 } else { 815 /* create and set and fill the repeatBlock */ 816 repeatBlock=getDataBlock(newTrie, start, TRUE); 817 if(repeatBlock<0) { 818 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 819 return; 820 } 821 writeBlock(newTrie->data+repeatBlock, value); 822 } 823 } 824 825 start+=UTRIE2_DATA_BLOCK_LENGTH; 826 } 827 828 if(rest>0) { 829 /* set partial block at [last block boundary..limit[ */ 830 block=getDataBlock(newTrie, start, TRUE); 831 if(block<0) { 832 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 833 return; 834 } 835 836 fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite); 837 } 838 839 return; 840} 841 842/* compaction --------------------------------------------------------------- */ 843 844static inline UBool 845equal_int32(const int32_t *s, const int32_t *t, int32_t length) { 846 while(length>0 && *s==*t) { 847 ++s; 848 ++t; 849 --length; 850 } 851 return (UBool)(length==0); 852} 853 854static inline UBool 855equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { 856 while(length>0 && *s==*t) { 857 ++s; 858 ++t; 859 --length; 860 } 861 return (UBool)(length==0); 862} 863 864static int32_t 865findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) { 866 int32_t block; 867 868 /* ensure that we do not even partially get past index2Length */ 869 index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH; 870 871 for(block=0; block<=index2Length; ++block) { 872 if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) { 873 return block; 874 } 875 } 876 return -1; 877} 878 879static int32_t 880findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) { 881 int32_t block; 882 883 /* ensure that we do not even partially get past dataLength */ 884 dataLength-=blockLength; 885 886 for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) { 887 if(equal_uint32(data+block, data+otherBlock, blockLength)) { 888 return block; 889 } 890 } 891 return -1; 892} 893 894/* 895 * Find the start of the last range in the trie by enumerating backward. 896 * Indexes for supplementary code points higher than this will be omitted. 897 */ 898static UChar32 899findHighStart(UNewTrie2 *trie, uint32_t highValue) { 900 const uint32_t *data32; 901 902 uint32_t value, initialValue; 903 UChar32 c, prev; 904 int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock; 905 906 data32=trie->data; 907 initialValue=trie->initialValue; 908 909 index2NullOffset=trie->index2NullOffset; 910 nullBlock=trie->dataNullOffset; 911 912 /* set variables for previous range */ 913 if(highValue==initialValue) { 914 prevI2Block=index2NullOffset; 915 prevBlock=nullBlock; 916 } else { 917 prevI2Block=-1; 918 prevBlock=-1; 919 } 920 prev=0x110000; 921 922 /* enumerate index-2 blocks */ 923 i1=UNEWTRIE2_INDEX_1_LENGTH; 924 c=prev; 925 while(c>0) { 926 i2Block=trie->index1[--i1]; 927 if(i2Block==prevI2Block) { 928 /* the index-2 block is the same as the previous one, and filled with highValue */ 929 c-=UTRIE2_CP_PER_INDEX_1_ENTRY; 930 continue; 931 } 932 prevI2Block=i2Block; 933 if(i2Block==index2NullOffset) { 934 /* this is the null index-2 block */ 935 if(highValue!=initialValue) { 936 return c; 937 } 938 c-=UTRIE2_CP_PER_INDEX_1_ENTRY; 939 } else { 940 /* enumerate data blocks for one index-2 block */ 941 for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) { 942 block=trie->index2[i2Block+ --i2]; 943 if(block==prevBlock) { 944 /* the block is the same as the previous one, and filled with highValue */ 945 c-=UTRIE2_DATA_BLOCK_LENGTH; 946 continue; 947 } 948 prevBlock=block; 949 if(block==nullBlock) { 950 /* this is the null data block */ 951 if(highValue!=initialValue) { 952 return c; 953 } 954 c-=UTRIE2_DATA_BLOCK_LENGTH; 955 } else { 956 for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) { 957 value=data32[block+ --j]; 958 if(value!=highValue) { 959 return c; 960 } 961 --c; 962 } 963 } 964 } 965 } 966 } 967 968 /* deliver last range */ 969 return 0; 970} 971 972/* 973 * Compact a build-time trie. 974 * 975 * The compaction 976 * - removes blocks that are identical with earlier ones 977 * - overlaps adjacent blocks as much as possible (if overlap==TRUE) 978 * - moves blocks in steps of the data granularity 979 * - moves and overlaps blocks that overlap with multiple values in the overlap region 980 * 981 * It does not 982 * - try to move and overlap blocks that are not already adjacent 983 */ 984static void 985compactData(UNewTrie2 *trie) { 986 int32_t start, newStart, movedStart; 987 int32_t blockLength, overlap; 988 int32_t i, mapIndex, blockCount; 989 990 /* do not compact linear-ASCII data */ 991 newStart=UTRIE2_DATA_START_OFFSET; 992 for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) { 993 trie->map[i]=start; 994 } 995 996 /* 997 * Start with a block length of 64 for 2-byte UTF-8, 998 * then switch to UTRIE2_DATA_BLOCK_LENGTH. 999 */ 1000 blockLength=64; 1001 blockCount=blockLength>>UTRIE2_SHIFT_2; 1002 for(start=newStart; start<trie->dataLength;) { 1003 /* 1004 * start: index of first entry of current block 1005 * newStart: index where the current block is to be moved 1006 * (right after current end of already-compacted data) 1007 */ 1008 if(start==UNEWTRIE2_DATA_0800_OFFSET) { 1009 blockLength=UTRIE2_DATA_BLOCK_LENGTH; 1010 blockCount=1; 1011 } 1012 1013 /* skip blocks that are not used */ 1014 if(trie->map[start>>UTRIE2_SHIFT_2]<=0) { 1015 /* advance start to the next block */ 1016 start+=blockLength; 1017 1018 /* leave newStart with the previous block! */ 1019 continue; 1020 } 1021 1022 /* search for an identical block */ 1023 if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength)) 1024 >=0 1025 ) { 1026 /* found an identical block, set the other block's index value for the current block */ 1027 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { 1028 trie->map[mapIndex++]=movedStart; 1029 movedStart+=UTRIE2_DATA_BLOCK_LENGTH; 1030 } 1031 1032 /* advance start to the next block */ 1033 start+=blockLength; 1034 1035 /* leave newStart with the previous block! */ 1036 continue; 1037 } 1038 1039 /* see if the beginning of this block can be overlapped with the end of the previous block */ 1040 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */ 1041 for(overlap=blockLength-UTRIE2_DATA_GRANULARITY; 1042 overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap); 1043 overlap-=UTRIE2_DATA_GRANULARITY) {} 1044 1045 if(overlap>0 || newStart<start) { 1046 /* some overlap, or just move the whole block */ 1047 movedStart=newStart-overlap; 1048 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { 1049 trie->map[mapIndex++]=movedStart; 1050 movedStart+=UTRIE2_DATA_BLOCK_LENGTH; 1051 } 1052 1053 /* move the non-overlapping indexes to their new positions */ 1054 start+=overlap; 1055 for(i=blockLength-overlap; i>0; --i) { 1056 trie->data[newStart++]=trie->data[start++]; 1057 } 1058 } else /* no overlap && newStart==start */ { 1059 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { 1060 trie->map[mapIndex++]=start; 1061 start+=UTRIE2_DATA_BLOCK_LENGTH; 1062 } 1063 newStart=start; 1064 } 1065 } 1066 1067 /* now adjust the index-2 table */ 1068 for(i=0; i<trie->index2Length; ++i) { 1069 if(i==UNEWTRIE2_INDEX_GAP_OFFSET) { 1070 /* Gap indexes are invalid (-1). Skip over the gap. */ 1071 i+=UNEWTRIE2_INDEX_GAP_LENGTH; 1072 } 1073 trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2]; 1074 } 1075 trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2]; 1076 1077 /* ensure dataLength alignment */ 1078 while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) { 1079 trie->data[newStart++]=trie->initialValue; 1080 } 1081 1082#ifdef UTRIE2_DEBUG 1083 /* we saved some space */ 1084 printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n", 1085 (long)trie->dataLength, (long)newStart); 1086#endif 1087 1088 trie->dataLength=newStart; 1089} 1090 1091static void 1092compactIndex2(UNewTrie2 *trie) { 1093 int32_t i, start, newStart, movedStart, overlap; 1094 1095 /* do not compact linear-BMP index-2 blocks */ 1096 newStart=UTRIE2_INDEX_2_BMP_LENGTH; 1097 for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) { 1098 trie->map[i]=start; 1099 } 1100 1101 /* Reduce the index table gap to what will be needed at runtime. */ 1102 newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1); 1103 1104 for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) { 1105 /* 1106 * start: index of first entry of current block 1107 * newStart: index where the current block is to be moved 1108 * (right after current end of already-compacted data) 1109 */ 1110 1111 /* search for an identical block */ 1112 if( (movedStart=findSameIndex2Block(trie->index2, newStart, start)) 1113 >=0 1114 ) { 1115 /* found an identical block, set the other block's index value for the current block */ 1116 trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart; 1117 1118 /* advance start to the next block */ 1119 start+=UTRIE2_INDEX_2_BLOCK_LENGTH; 1120 1121 /* leave newStart with the previous block! */ 1122 continue; 1123 } 1124 1125 /* see if the beginning of this block can be overlapped with the end of the previous block */ 1126 /* look for maximum overlap with the previous, adjacent block */ 1127 for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1; 1128 overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap); 1129 --overlap) {} 1130 1131 if(overlap>0 || newStart<start) { 1132 /* some overlap, or just move the whole block */ 1133 trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap; 1134 1135 /* move the non-overlapping indexes to their new positions */ 1136 start+=overlap; 1137 for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) { 1138 trie->index2[newStart++]=trie->index2[start++]; 1139 } 1140 } else /* no overlap && newStart==start */ { 1141 trie->map[start>>UTRIE2_SHIFT_1_2]=start; 1142 start+=UTRIE2_INDEX_2_BLOCK_LENGTH; 1143 newStart=start; 1144 } 1145 } 1146 1147 /* now adjust the index-1 table */ 1148 for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { 1149 trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2]; 1150 } 1151 trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2]; 1152 1153 /* 1154 * Ensure data table alignment: 1155 * Needs to be granularity-aligned for 16-bit trie 1156 * (so that dataMove will be down-shiftable), 1157 * and 2-aligned for uint32_t data. 1158 */ 1159 while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) { 1160 /* Arbitrary value: 0x3fffc not possible for real data. */ 1161 trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT; 1162 } 1163 1164#ifdef UTRIE2_DEBUG 1165 /* we saved some space */ 1166 printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n", 1167 (long)trie->index2Length, (long)newStart); 1168#endif 1169 1170 trie->index2Length=newStart; 1171} 1172 1173static void 1174compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) { 1175 UNewTrie2 *newTrie; 1176 UChar32 highStart, suppHighStart; 1177 uint32_t highValue; 1178 1179 newTrie=trie->newTrie; 1180 1181 /* find highStart and round it up */ 1182 highValue=utrie2_get32(trie, 0x10ffff); 1183 highStart=findHighStart(newTrie, highValue); 1184 highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1); 1185 if(highStart==0x110000) { 1186 highValue=trie->errorValue; 1187 } 1188 1189 /* 1190 * Set trie->highStart only after utrie2_get32(trie, highStart). 1191 * Otherwise utrie2_get32(trie, highStart) would try to read the highValue. 1192 */ 1193 trie->highStart=newTrie->highStart=highStart; 1194 1195#ifdef UTRIE2_DEBUG 1196 printf("UTrie2: highStart U+%04lx highValue 0x%lx initialValue 0x%lx\n", 1197 (long)highStart, (long)highValue, (long)trie->initialValue); 1198#endif 1199 1200 if(highStart<0x110000) { 1201 /* Blank out [highStart..10ffff] to release associated data blocks. */ 1202 suppHighStart= highStart<=0x10000 ? 0x10000 : highStart; 1203 utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode); 1204 if(U_FAILURE(*pErrorCode)) { 1205 return; 1206 } 1207 } 1208 1209 compactData(newTrie); 1210 if(highStart>0x10000) { 1211 compactIndex2(newTrie); 1212#ifdef UTRIE2_DEBUG 1213 } else { 1214 printf("UTrie2: highStart U+%04lx count of 16-bit index-2 words %lu->%lu\n", 1215 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET); 1216#endif 1217 } 1218 1219 /* 1220 * Store the highValue in the data array and round up the dataLength. 1221 * Must be done after compactData() because that assumes that dataLength 1222 * is a multiple of UTRIE2_DATA_BLOCK_LENGTH. 1223 */ 1224 newTrie->data[newTrie->dataLength++]=highValue; 1225 while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) { 1226 newTrie->data[newTrie->dataLength++]=trie->initialValue; 1227 } 1228 1229 newTrie->isCompacted=TRUE; 1230} 1231 1232/* serialization ------------------------------------------------------------ */ 1233 1234/** 1235 * Maximum length of the runtime index array. 1236 * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength. 1237 * (The actual maximum length is lower, 1238 * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.) 1239 */ 1240#define UTRIE2_MAX_INDEX_LENGTH 0xffff 1241 1242/** 1243 * Maximum length of the runtime data array. 1244 * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT, 1245 * and by uint16_t UTrie2Header.shiftedDataLength. 1246 */ 1247#define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT) 1248 1249/* Compact and internally serialize the trie. */ 1250U_CAPI void U_EXPORT2 1251utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) { 1252 UNewTrie2 *newTrie; 1253 UTrie2Header *header; 1254 uint32_t *p; 1255 uint16_t *dest16; 1256 int32_t i, length; 1257 int32_t allIndexesLength; 1258 int32_t dataMove; /* >0 if the data is moved to the end of the index array */ 1259 UChar32 highStart; 1260 1261 /* argument check */ 1262 if(U_FAILURE(*pErrorCode)) { 1263 return; 1264 } 1265 if( trie==NULL || 1266 valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits 1267 ) { 1268 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1269 return; 1270 } 1271 newTrie=trie->newTrie; 1272 if(newTrie==NULL) { 1273 /* already frozen */ 1274 UTrie2ValueBits frozenValueBits= 1275 trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS; 1276 if(valueBits!=frozenValueBits) { 1277 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1278 } 1279 return; 1280 } 1281 1282 /* compact if necessary */ 1283 if(!newTrie->isCompacted) { 1284 compactTrie(trie, pErrorCode); 1285 if(U_FAILURE(*pErrorCode)) { 1286 return; 1287 } 1288 } 1289 highStart=trie->highStart; 1290 1291 if(highStart<=0x10000) { 1292 allIndexesLength=UTRIE2_INDEX_1_OFFSET; 1293 } else { 1294 allIndexesLength=newTrie->index2Length; 1295 } 1296 if(valueBits==UTRIE2_16_VALUE_BITS) { 1297 dataMove=allIndexesLength; 1298 } else { 1299 dataMove=0; 1300 } 1301 1302 /* are indexLength and dataLength within limits? */ 1303 if( /* for unshifted indexLength */ 1304 allIndexesLength>UTRIE2_MAX_INDEX_LENGTH || 1305 /* for unshifted dataNullOffset */ 1306 (dataMove+newTrie->dataNullOffset)>0xffff || 1307 /* for unshifted 2-byte UTF-8 index-2 values */ 1308 (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff || 1309 /* for shiftedDataLength */ 1310 (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH 1311 ) { 1312 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1313 return; 1314 } 1315 1316 /* calculate the total serialized length */ 1317 length=sizeof(UTrie2Header)+allIndexesLength*2; 1318 if(valueBits==UTRIE2_16_VALUE_BITS) { 1319 length+=newTrie->dataLength*2; 1320 } else { 1321 length+=newTrie->dataLength*4; 1322 } 1323 1324 trie->memory=uprv_malloc(length); 1325 if(trie->memory==NULL) { 1326 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1327 return; 1328 } 1329 trie->length=length; 1330 trie->isMemoryOwned=TRUE; 1331 1332 trie->indexLength=allIndexesLength; 1333 trie->dataLength=newTrie->dataLength; 1334 if(highStart<=0x10000) { 1335 trie->index2NullOffset=0xffff; 1336 } else { 1337 trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset; 1338 } 1339 trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset); 1340 trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY; 1341 1342 /* set the header fields */ 1343 header=(UTrie2Header *)trie->memory; 1344 1345 header->signature=UTRIE2_SIG; /* "Tri2" */ 1346 header->options=(uint16_t)valueBits; 1347 1348 header->indexLength=(uint16_t)trie->indexLength; 1349 header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT); 1350 header->index2NullOffset=trie->index2NullOffset; 1351 header->dataNullOffset=trie->dataNullOffset; 1352 header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1); 1353 1354 /* fill the index and data arrays */ 1355 dest16=(uint16_t *)(header+1); 1356 trie->index=dest16; 1357 1358 /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */ 1359 p=(uint32_t *)newTrie->index2; 1360 for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) { 1361 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); 1362 } 1363 1364 /* write UTF-8 2-byte index-2 values, not right-shifted */ 1365 for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ 1366 *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); 1367 } 1368 for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ 1369 *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]); 1370 } 1371 1372 if(highStart>0x10000) { 1373 int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1; 1374 int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length; 1375 1376 /* write 16-bit index-1 values for supplementary code points */ 1377 p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; 1378 for(i=index1Length; i>0; --i) { 1379 *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++); 1380 } 1381 1382 /* 1383 * write the index-2 array values for supplementary code points, 1384 * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove 1385 */ 1386 p=(uint32_t *)newTrie->index2+index2Offset; 1387 for(i=newTrie->index2Length-index2Offset; i>0; --i) { 1388 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); 1389 } 1390 } 1391 1392 /* write the 16/32-bit data array */ 1393 switch(valueBits) { 1394 case UTRIE2_16_VALUE_BITS: 1395 /* write 16-bit data values */ 1396 trie->data16=dest16; 1397 trie->data32=NULL; 1398 p=newTrie->data; 1399 for(i=newTrie->dataLength; i>0; --i) { 1400 *dest16++=(uint16_t)*p++; 1401 } 1402 break; 1403 case UTRIE2_32_VALUE_BITS: 1404 /* write 32-bit data values */ 1405 trie->data16=NULL; 1406 trie->data32=(uint32_t *)dest16; 1407 uprv_memcpy(dest16, newTrie->data, newTrie->dataLength*4); 1408 break; 1409 default: 1410 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1411 return; 1412 } 1413 1414 /* Delete the UNewTrie2. */ 1415 uprv_free(newTrie->data); 1416 uprv_free(newTrie); 1417 trie->newTrie=NULL; 1418} 1419 1420U_CAPI UBool U_EXPORT2 1421utrie2_isFrozen(const UTrie2 *trie) { 1422 return (UBool)(trie->newTrie==NULL); 1423} 1424 1425U_CAPI int32_t U_EXPORT2 1426utrie2_serialize(UTrie2 *trie, 1427 void *data, int32_t capacity, 1428 UErrorCode *pErrorCode) { 1429 /* argument check */ 1430 if(U_FAILURE(*pErrorCode)) { 1431 return 0; 1432 } 1433 1434 if( trie==NULL || trie->memory==NULL || trie->newTrie!=NULL || 1435 capacity<0 || (capacity>0 && (data==NULL || (U_POINTER_MASK_LSB(data, 3)!=0))) 1436 ) { 1437 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1438 return 0; 1439 } 1440 1441 if(capacity>=trie->length) { 1442 uprv_memcpy(data, trie->memory, trie->length); 1443 } else { 1444 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1445 } 1446 return trie->length; 1447} 1448 1449/* 1450 * This is here to avoid a dependency from utrie2.cpp on utrie.c. 1451 * This file already depends on utrie.c. 1452 * Otherwise, this should be in utrie2.cpp right after utrie2_swap(). 1453 */ 1454U_CAPI int32_t U_EXPORT2 1455utrie2_swapAnyVersion(const UDataSwapper *ds, 1456 const void *inData, int32_t length, void *outData, 1457 UErrorCode *pErrorCode) { 1458 if(U_SUCCESS(*pErrorCode)) { 1459 switch(utrie2_getVersion(inData, length, TRUE)) { 1460 case 1: 1461 return utrie_swap(ds, inData, length, outData, pErrorCode); 1462 case 2: 1463 return utrie2_swap(ds, inData, length, outData, pErrorCode); 1464 default: 1465 *pErrorCode=U_INVALID_FORMAT_ERROR; 1466 return 0; 1467 } 1468 } 1469 return 0; 1470} 1471