propsvec.c revision b0ac937921a2c196d8b9da665135bf6ba01a1ccf
1/* 2******************************************************************************* 3* 4* Copyright (C) 2002-2009, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: propsvec.c 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2002feb22 14* created by: Markus W. Scherer 15* 16* Store bits (Unicode character properties) in bit set vectors. 17*/ 18 19#include <stdlib.h> 20#include "unicode/utypes.h" 21#include "cmemory.h" 22#include "utrie.h" 23#include "utrie2.h" 24#include "uarrsort.h" 25#include "propsvec.h" 26 27struct UPropsVectors { 28 uint32_t *v; 29 int32_t columns; /* number of columns, plus two for start & limit values */ 30 int32_t maxRows; 31 int32_t rows; 32 int32_t prevRow; /* search optimization: remember last row seen */ 33 UBool isCompacted; 34}; 35 36#define UPVEC_INITIAL_ROWS (1<<12) 37#define UPVEC_MEDIUM_ROWS ((int32_t)1<<16) 38#define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1) 39 40U_CAPI UPropsVectors * U_EXPORT2 41upvec_open(int32_t columns, UErrorCode *pErrorCode) { 42 UPropsVectors *pv; 43 uint32_t *v, *row; 44 uint32_t cp; 45 46 if(U_FAILURE(*pErrorCode)) { 47 return NULL; 48 } 49 if(columns<1) { 50 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 51 return NULL; 52 } 53 columns+=2; /* count range start and limit columns */ 54 55 pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors)); 56 v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4); 57 if(pv==NULL || v==NULL) { 58 uprv_free(pv); 59 uprv_free(v); 60 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 61 return NULL; 62 } 63 uprv_memset(pv, 0, sizeof(UPropsVectors)); 64 pv->v=v; 65 pv->columns=columns; 66 pv->maxRows=UPVEC_INITIAL_ROWS; 67 pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP); 68 69 /* set the all-Unicode row and the special-value rows */ 70 row=pv->v; 71 uprv_memset(row, 0, pv->rows*columns*4); 72 row[0]=0; 73 row[1]=0x110000; 74 row+=columns; 75 for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) { 76 row[0]=cp; 77 row[1]=cp+1; 78 row+=columns; 79 } 80 return pv; 81} 82 83U_CAPI void U_EXPORT2 84upvec_close(UPropsVectors *pv) { 85 if(pv!=NULL) { 86 uprv_free(pv->v); 87 uprv_free(pv); 88 } 89} 90 91static uint32_t * 92_findRow(UPropsVectors *pv, UChar32 rangeStart) { 93 uint32_t *row; 94 int32_t columns, i, start, limit, prevRow, rows; 95 96 columns=pv->columns; 97 rows=limit=pv->rows; 98 prevRow=pv->prevRow; 99 100 /* check the vicinity of the last-seen row (start searching with an unrolled loop) */ 101 row=pv->v+prevRow*columns; 102 if(rangeStart>=(UChar32)row[0]) { 103 if(rangeStart<(UChar32)row[1]) { 104 /* same row as last seen */ 105 return row; 106 } else if(rangeStart<(UChar32)(row+=columns)[1]) { 107 /* next row after the last one */ 108 pv->prevRow=prevRow+1; 109 return row; 110 } else if(rangeStart<(UChar32)(row+=columns)[1]) { 111 /* second row after the last one */ 112 pv->prevRow=prevRow+2; 113 return row; 114 } else if((rangeStart-(UChar32)row[1])<10) { 115 /* we are close, continue looping */ 116 prevRow+=2; 117 do { 118 ++prevRow; 119 row+=columns; 120 } while(rangeStart>=(UChar32)row[1]); 121 pv->prevRow=prevRow; 122 return row; 123 } 124 } else if(rangeStart<(UChar32)pv->v[1]) { 125 /* the very first row */ 126 pv->prevRow=0; 127 return pv->v; 128 } 129 130 /* do a binary search for the start of the range */ 131 start=0; 132 while(start<limit-1) { 133 i=(start+limit)/2; 134 row=pv->v+i*columns; 135 if(rangeStart<(UChar32)row[0]) { 136 limit=i; 137 } else if(rangeStart<(UChar32)row[1]) { 138 pv->prevRow=i; 139 return row; 140 } else { 141 start=i; 142 } 143 } 144 145 /* must be found because all ranges together always cover all of Unicode */ 146 pv->prevRow=start; 147 return pv->v+start*columns; 148} 149 150U_CAPI void U_EXPORT2 151upvec_setValue(UPropsVectors *pv, 152 UChar32 start, UChar32 end, 153 int32_t column, 154 uint32_t value, uint32_t mask, 155 UErrorCode *pErrorCode) { 156 uint32_t *firstRow, *lastRow; 157 int32_t columns; 158 UChar32 limit; 159 UBool splitFirstRow, splitLastRow; 160 161 /* argument checking */ 162 if(U_FAILURE(*pErrorCode)) { 163 return; 164 } 165 if( pv==NULL || 166 start<0 || start>end || end>UPVEC_MAX_CP || 167 column<0 || column>=(pv->columns-2) 168 ) { 169 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 170 return; 171 } 172 if(pv->isCompacted) { 173 *pErrorCode=U_NO_WRITE_PERMISSION; 174 return; 175 } 176 limit=end+1; 177 178 /* initialize */ 179 columns=pv->columns; 180 column+=2; /* skip range start and limit columns */ 181 value&=mask; 182 183 /* find the rows whose ranges overlap with the input range */ 184 185 /* find the first and last rows, always successful */ 186 firstRow=_findRow(pv, start); 187 lastRow=_findRow(pv, end); 188 189 /* 190 * Rows need to be split if they partially overlap with the 191 * input range (only possible for the first and last rows) 192 * and if their value differs from the input value. 193 */ 194 splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask)); 195 splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask)); 196 197 /* split first/last rows if necessary */ 198 if(splitFirstRow || splitLastRow) { 199 int32_t count, rows; 200 201 rows=pv->rows; 202 if((rows+splitFirstRow+splitLastRow)>pv->maxRows) { 203 uint32_t *newVectors; 204 int32_t newMaxRows; 205 206 if(pv->maxRows<UPVEC_MEDIUM_ROWS) { 207 newMaxRows=UPVEC_MEDIUM_ROWS; 208 } else if(pv->maxRows<UPVEC_MAX_ROWS) { 209 newMaxRows=UPVEC_MAX_ROWS; 210 } else { 211 /* Implementation bug, or UPVEC_MAX_ROWS too low. */ 212 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 213 return; 214 } 215 newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4); 216 if(newVectors==NULL) { 217 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 218 return; 219 } 220 uprv_memcpy(newVectors, pv->v, rows*columns*4); 221 firstRow=newVectors+(firstRow-pv->v); 222 lastRow=newVectors+(lastRow-pv->v); 223 uprv_free(pv->v); 224 pv->v=newVectors; 225 pv->maxRows=newMaxRows; 226 } 227 228 /* count the number of row cells to move after the last row, and move them */ 229 count = (int32_t)((pv->v+rows*columns)-(lastRow+columns)); 230 if(count>0) { 231 uprv_memmove( 232 lastRow+(1+splitFirstRow+splitLastRow)*columns, 233 lastRow+columns, 234 count*4); 235 } 236 pv->rows=rows+splitFirstRow+splitLastRow; 237 238 /* split the first row, and move the firstRow pointer to the second part */ 239 if(splitFirstRow) { 240 /* copy all affected rows up one and move the lastRow pointer */ 241 count = (int32_t)((lastRow-firstRow)+columns); 242 uprv_memmove(firstRow+columns, firstRow, count*4); 243 lastRow+=columns; 244 245 /* split the range and move the firstRow pointer */ 246 firstRow[1]=firstRow[columns]=(uint32_t)start; 247 firstRow+=columns; 248 } 249 250 /* split the last row */ 251 if(splitLastRow) { 252 /* copy the last row data */ 253 uprv_memcpy(lastRow+columns, lastRow, columns*4); 254 255 /* split the range and move the firstRow pointer */ 256 lastRow[1]=lastRow[columns]=(uint32_t)limit; 257 } 258 } 259 260 /* set the "row last seen" to the last row for the range */ 261 pv->prevRow=(int32_t)((lastRow-(pv->v))/columns); 262 263 /* set the input value in all remaining rows */ 264 firstRow+=column; 265 lastRow+=column; 266 mask=~mask; 267 for(;;) { 268 *firstRow=(*firstRow&mask)|value; 269 if(firstRow==lastRow) { 270 break; 271 } 272 firstRow+=columns; 273 } 274} 275 276U_CAPI uint32_t U_EXPORT2 277upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) { 278 uint32_t *row; 279 280 if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) { 281 return 0; 282 } 283 row=_findRow((UPropsVectors *)pv, c); 284 return row[2+column]; 285} 286 287U_CAPI uint32_t * U_EXPORT2 288upvec_getRow(const UPropsVectors *pv, int32_t rowIndex, 289 UChar32 *pRangeStart, UChar32 *pRangeEnd) { 290 uint32_t *row; 291 int32_t columns; 292 293 if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) { 294 return NULL; 295 } 296 297 columns=pv->columns; 298 row=pv->v+rowIndex*columns; 299 if(pRangeStart!=NULL) { 300 *pRangeStart=(UChar32)row[0]; 301 } 302 if(pRangeEnd!=NULL) { 303 *pRangeEnd=(UChar32)row[1]-1; 304 } 305 return row+2; 306} 307 308static int32_t U_CALLCONV 309upvec_compareRows(const void *context, const void *l, const void *r) { 310 const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r; 311 const UPropsVectors *pv=(const UPropsVectors *)context; 312 int32_t i, count, columns; 313 314 count=columns=pv->columns; /* includes start/limit columns */ 315 316 /* start comparing after start/limit but wrap around to them */ 317 i=2; 318 do { 319 if(left[i]!=right[i]) { 320 return left[i]<right[i] ? -1 : 1; 321 } 322 if(++i==columns) { 323 i=0; 324 } 325 } while(--count>0); 326 327 return 0; 328} 329 330U_CAPI void U_EXPORT2 331upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) { 332 uint32_t *row; 333 int32_t i, columns, valueColumns, rows, count; 334 UChar32 start, limit; 335 336 /* argument checking */ 337 if(U_FAILURE(*pErrorCode)) { 338 return; 339 } 340 if(handler==NULL) { 341 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 342 return; 343 } 344 if(pv->isCompacted) { 345 return; 346 } 347 348 /* Set the flag now: Sorting and compacting destroys the builder data structure. */ 349 pv->isCompacted=TRUE; 350 351 rows=pv->rows; 352 columns=pv->columns; 353 valueColumns=columns-2; /* not counting start & limit */ 354 355 /* sort the properties vectors to find unique vector values */ 356 uprv_sortArray(pv->v, rows, columns*4, 357 upvec_compareRows, pv, FALSE, pErrorCode); 358 if(U_FAILURE(*pErrorCode)) { 359 return; 360 } 361 362 /* 363 * Find and set the special values. 364 * This has to do almost the same work as the compaction below, 365 * to find the indexes where the special-value rows will move. 366 */ 367 row=pv->v; 368 count=-valueColumns; 369 for(i=0; i<rows; ++i) { 370 start=(UChar32)row[0]; 371 372 /* count a new values vector if it is different from the current one */ 373 if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) { 374 count+=valueColumns; 375 } 376 377 if(start>=UPVEC_FIRST_SPECIAL_CP) { 378 handler(context, start, start, count, row+2, valueColumns, pErrorCode); 379 if(U_FAILURE(*pErrorCode)) { 380 return; 381 } 382 } 383 384 row+=columns; 385 } 386 387 /* count is at the beginning of the last vector, add valueColumns to include that last vector */ 388 count+=valueColumns; 389 390 /* Call the handler once more to signal the start of delivering real values. */ 391 handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP, 392 count, row-valueColumns, valueColumns, pErrorCode); 393 if(U_FAILURE(*pErrorCode)) { 394 return; 395 } 396 397 /* 398 * Move vector contents up to a contiguous array with only unique 399 * vector values, and call the handler function for each vector. 400 * 401 * This destroys the Properties Vector structure and replaces it 402 * with an array of just vector values. 403 */ 404 row=pv->v; 405 count=-valueColumns; 406 for(i=0; i<rows; ++i) { 407 /* fetch these first before memmove() may overwrite them */ 408 start=(UChar32)row[0]; 409 limit=(UChar32)row[1]; 410 411 /* add a new values vector if it is different from the current one */ 412 if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) { 413 count+=valueColumns; 414 uprv_memmove(pv->v+count, row+2, valueColumns*4); 415 } 416 417 if(start<UPVEC_FIRST_SPECIAL_CP) { 418 handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode); 419 if(U_FAILURE(*pErrorCode)) { 420 return; 421 } 422 } 423 424 row+=columns; 425 } 426 427 /* count is at the beginning of the last vector, add one to include that last vector */ 428 pv->rows=count/valueColumns+1; 429} 430 431U_CAPI const uint32_t * U_EXPORT2 432upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) { 433 if(!pv->isCompacted) { 434 return NULL; 435 } 436 if(pRows!=NULL) { 437 *pRows=pv->rows; 438 } 439 if(pColumns!=NULL) { 440 *pColumns=pv->columns-2; 441 } 442 return pv->v; 443} 444 445U_CAPI uint32_t * U_EXPORT2 446upvec_cloneArray(const UPropsVectors *pv, 447 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) { 448 uint32_t *clonedArray; 449 int32_t byteLength; 450 451 if(U_FAILURE(*pErrorCode)) { 452 return NULL; 453 } 454 if(!pv->isCompacted) { 455 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 456 return NULL; 457 } 458 byteLength=pv->rows*(pv->columns-2)*4; 459 clonedArray=(uint32_t *)uprv_malloc(byteLength); 460 if(clonedArray==NULL) { 461 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 462 return NULL; 463 } 464 uprv_memcpy(clonedArray, pv->v, byteLength); 465 if(pRows!=NULL) { 466 *pRows=pv->rows; 467 } 468 if(pColumns!=NULL) { 469 *pColumns=pv->columns-2; 470 } 471 return clonedArray; 472} 473 474U_CAPI UTrie2 * U_EXPORT2 475upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) { 476 UPVecToUTrie2Context toUTrie2={ NULL }; 477 upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode); 478 utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode); 479 if(U_FAILURE(*pErrorCode)) { 480 utrie2_close(toUTrie2.trie); 481 toUTrie2.trie=NULL; 482 } 483 return toUTrie2.trie; 484} 485 486/* 487 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts 488 * some 16-bit field and builds and returns a UTrie2. 489 */ 490 491U_CAPI void U_CALLCONV 492upvec_compactToUTrieHandler(void *context, 493 UChar32 start, UChar32 end, 494 int32_t rowIndex, uint32_t *row, int32_t columns, 495 UErrorCode *pErrorCode) { 496 UPVecToUTrieContext *toUTrie=(UPVecToUTrieContext *)context; 497 if(start<UPVEC_FIRST_SPECIAL_CP) { 498 if(!utrie_setRange32(toUTrie->newTrie, start, end+1, (uint32_t)rowIndex, TRUE)) { 499 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 500 } 501 } else { 502 switch(start) { 503 case UPVEC_INITIAL_VALUE_CP: 504 toUTrie->initialValue=rowIndex; 505 break; 506 case UPVEC_START_REAL_VALUES_CP: 507 if(rowIndex>0xffff) { 508 /* too many rows for a 16-bit trie */ 509 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 510 } else { 511 toUTrie->newTrie=utrie_open(NULL, NULL, toUTrie->capacity, 512 toUTrie->initialValue, toUTrie->initialValue, 513 toUTrie->latin1Linear); 514 if(toUTrie->newTrie==NULL) { 515 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 516 } 517 } 518 break; 519 default: 520 break; 521 } 522 } 523} 524 525U_CAPI void U_CALLCONV 526upvec_compactToUTrie2Handler(void *context, 527 UChar32 start, UChar32 end, 528 int32_t rowIndex, uint32_t *row, int32_t columns, 529 UErrorCode *pErrorCode) { 530 UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context; 531 if(start<UPVEC_FIRST_SPECIAL_CP) { 532 utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode); 533 } else { 534 switch(start) { 535 case UPVEC_INITIAL_VALUE_CP: 536 toUTrie2->initialValue=rowIndex; 537 break; 538 case UPVEC_ERROR_VALUE_CP: 539 toUTrie2->errorValue=rowIndex; 540 break; 541 case UPVEC_START_REAL_VALUES_CP: 542 toUTrie2->maxValue=rowIndex; 543 if(rowIndex>0xffff) { 544 /* too many rows for a 16-bit trie */ 545 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 546 } else { 547 toUTrie2->trie=utrie2_open(toUTrie2->initialValue, 548 toUTrie2->errorValue, pErrorCode); 549 } 550 break; 551 default: 552 break; 553 } 554 } 555} 556