1/* 2******************************************************************************* 3* 4* Copyright (C) 1999-2011, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: ucol_wgt.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2001mar08 14* created by: Markus W. Scherer 15* 16* This file contains code for allocating n collation element weights 17* between two exclusive limits. 18* It is used only internally by ucol_bld. 19*/ 20 21#include "unicode/utypes.h" 22 23#if !UCONFIG_NO_COLLATION 24 25#include "ucol_imp.h" 26#include "ucol_wgt.h" 27#include "cmemory.h" 28#include "uarrsort.h" 29 30#ifdef UCOL_DEBUG 31# include <stdio.h> 32#endif 33 34/* collation element weight allocation -------------------------------------- */ 35 36/* helper functions for CE weights */ 37 38static inline int32_t 39lengthOfWeight(uint32_t weight) { 40 if((weight&0xffffff)==0) { 41 return 1; 42 } else if((weight&0xffff)==0) { 43 return 2; 44 } else if((weight&0xff)==0) { 45 return 3; 46 } else { 47 return 4; 48 } 49} 50 51static inline uint32_t 52getWeightTrail(uint32_t weight, int32_t length) { 53 return (uint32_t)(weight>>(8*(4-length)))&0xff; 54} 55 56static inline uint32_t 57setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) { 58 length=8*(4-length); 59 return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length)); 60} 61 62static inline uint32_t 63getWeightByte(uint32_t weight, int32_t idx) { 64 return getWeightTrail(weight, idx); /* same calculation */ 65} 66 67static inline uint32_t 68setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) { 69 uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */ 70 71 idx*=8; 72 if(idx<32) { 73 mask=((uint32_t)0xffffffff)>>idx; 74 } else { 75 // Do not use uint32_t>>32 because on some platforms that does not shift at all 76 // while we need it to become 0. 77 // PowerPC: 0xffffffff>>32 = 0 (wanted) 78 // x86: 0xffffffff>>32 = 0xffffffff (not wanted) 79 // 80 // ANSI C99 6.5.7 Bitwise shift operators: 81 // "If the value of the right operand is negative 82 // or is greater than or equal to the width of the promoted left operand, 83 // the behavior is undefined." 84 mask=0; 85 } 86 idx=32-idx; 87 mask|=0xffffff00<<idx; 88 return (uint32_t)((weight&mask)|(byte<<idx)); 89} 90 91static inline uint32_t 92truncateWeight(uint32_t weight, int32_t length) { 93 return (uint32_t)(weight&(0xffffffff<<(8*(4-length)))); 94} 95 96static inline uint32_t 97incWeightTrail(uint32_t weight, int32_t length) { 98 return (uint32_t)(weight+(1UL<<(8*(4-length)))); 99} 100 101static inline uint32_t 102decWeightTrail(uint32_t weight, int32_t length) { 103 return (uint32_t)(weight-(1UL<<(8*(4-length)))); 104} 105 106static inline uint32_t 107incWeight(uint32_t weight, int32_t length, uint32_t maxByte) { 108 uint32_t byte; 109 110 for(;;) { 111 byte=getWeightByte(weight, length); 112 if(byte<maxByte) { 113 return setWeightByte(weight, length, byte+1); 114 } else { 115 /* roll over, set this byte to UCOL_BYTE_FIRST_TAILORED and increment the previous one */ 116 weight=setWeightByte(weight, length, UCOL_BYTE_FIRST_TAILORED); 117 --length; 118 } 119 } 120} 121 122static inline int32_t 123lengthenRange(WeightRange *range, uint32_t maxByte, uint32_t countBytes) { 124 int32_t length; 125 126 length=range->length2+1; 127 range->start=setWeightTrail(range->start, length, UCOL_BYTE_FIRST_TAILORED); 128 range->end=setWeightTrail(range->end, length, maxByte); 129 range->count2*=countBytes; 130 range->length2=length; 131 return length; 132} 133 134/* for uprv_sortArray: sort ranges in weight order */ 135static int32_t U_CALLCONV 136compareRanges(const void * /*context*/, const void *left, const void *right) { 137 uint32_t l, r; 138 139 l=((const WeightRange *)left)->start; 140 r=((const WeightRange *)right)->start; 141 if(l<r) { 142 return -1; 143 } else if(l>r) { 144 return 1; 145 } else { 146 return 0; 147 } 148} 149 150/* 151 * take two CE weights and calculate the 152 * possible ranges of weights between the two limits, excluding them 153 * for weights with up to 4 bytes there are up to 2*4-1=7 ranges 154 */ 155static inline int32_t 156getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit, 157 uint32_t maxByte, uint32_t countBytes, 158 WeightRange ranges[7]) { 159 WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */ 160 uint32_t weight, trail; 161 int32_t length, lowerLength, upperLength, rangeCount; 162 163 /* assume that both lowerLimit & upperLimit are not 0 */ 164 165 /* get the lengths of the limits */ 166 lowerLength=lengthOfWeight(lowerLimit); 167 upperLength=lengthOfWeight(upperLimit); 168 169#ifdef UCOL_DEBUG 170 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength); 171 printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength); 172#endif 173 174 if(lowerLimit>=upperLimit) { 175#ifdef UCOL_DEBUG 176 printf("error: no space between lower & upper limits\n"); 177#endif 178 return 0; 179 } 180 181 /* check that neither is a prefix of the other */ 182 if(lowerLength<upperLength) { 183 if(lowerLimit==truncateWeight(upperLimit, lowerLength)) { 184#ifdef UCOL_DEBUG 185 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit); 186#endif 187 return 0; 188 } 189 } 190 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */ 191 192 /* reset local variables */ 193 uprv_memset(lower, 0, sizeof(lower)); 194 uprv_memset(&middle, 0, sizeof(middle)); 195 uprv_memset(upper, 0, sizeof(upper)); 196 197 /* 198 * With the limit lengths of 1..4, there are up to 7 ranges for allocation: 199 * range minimum length 200 * lower[4] 4 201 * lower[3] 3 202 * lower[2] 2 203 * middle 1 204 * upper[2] 2 205 * upper[3] 3 206 * upper[4] 4 207 * 208 * We are now going to calculate up to 7 ranges. 209 * Some of them will typically overlap, so we will then have to merge and eliminate ranges. 210 */ 211 weight=lowerLimit; 212 for(length=lowerLength; length>=2; --length) { 213 trail=getWeightTrail(weight, length); 214 if(trail<maxByte) { 215 lower[length].start=incWeightTrail(weight, length); 216 lower[length].end=setWeightTrail(weight, length, maxByte); 217 lower[length].length=length; 218 lower[length].count=maxByte-trail; 219 } 220 weight=truncateWeight(weight, length-1); 221 } 222 middle.start=incWeightTrail(weight, 1); 223 224 weight=upperLimit; 225 for(length=upperLength; length>=2; --length) { 226 trail=getWeightTrail(weight, length); 227 if(trail>UCOL_BYTE_FIRST_TAILORED) { 228 upper[length].start=setWeightTrail(weight, length, UCOL_BYTE_FIRST_TAILORED); 229 upper[length].end=decWeightTrail(weight, length); 230 upper[length].length=length; 231 upper[length].count=trail-UCOL_BYTE_FIRST_TAILORED; 232 } 233 weight=truncateWeight(weight, length-1); 234 } 235 middle.end=decWeightTrail(weight, 1); 236 237 /* set the middle range */ 238 middle.length=1; 239 if(middle.end>=middle.start) { 240 middle.count=(int32_t)((middle.end-middle.start)>>24)+1; 241 } else { 242 /* eliminate overlaps */ 243 uint32_t start, end; 244 245 /* remove the middle range */ 246 middle.count=0; 247 248 /* reduce or remove the lower ranges that go beyond upperLimit */ 249 for(length=4; length>=2; --length) { 250 if(lower[length].count>0 && upper[length].count>0) { 251 start=upper[length].start; 252 end=lower[length].end; 253 254 if(end>=start || incWeight(end, length, maxByte)==start) { 255 /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */ 256 start=lower[length].start; 257 end=lower[length].end=upper[length].end; 258 /* 259 * merging directly adjacent ranges needs to subtract the 0/1 gaps in between; 260 * it may result in a range with count>countBytes 261 */ 262 lower[length].count= 263 (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+ 264 countBytes*(getWeightByte(end, length-1)-getWeightByte(start, length-1))); 265 upper[length].count=0; 266 while(--length>=2) { 267 lower[length].count=upper[length].count=0; 268 } 269 break; 270 } 271 } 272 } 273 } 274 275#ifdef UCOL_DEBUG 276 /* print ranges */ 277 for(length=4; length>=2; --length) { 278 if(lower[length].count>0) { 279 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count); 280 } 281 } 282 if(middle.count>0) { 283 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count); 284 } 285 for(length=2; length<=4; ++length) { 286 if(upper[length].count>0) { 287 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count); 288 } 289 } 290#endif 291 292 /* copy the ranges, shortest first, into the result array */ 293 rangeCount=0; 294 if(middle.count>0) { 295 uprv_memcpy(ranges, &middle, sizeof(WeightRange)); 296 rangeCount=1; 297 } 298 for(length=2; length<=4; ++length) { 299 /* copy upper first so that later the middle range is more likely the first one to use */ 300 if(upper[length].count>0) { 301 uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange)); 302 ++rangeCount; 303 } 304 if(lower[length].count>0) { 305 uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange)); 306 ++rangeCount; 307 } 308 } 309 return rangeCount; 310} 311 312/* 313 * call getWeightRanges and then determine heuristically 314 * which ranges to use for a given number of weights between (excluding) 315 * two limits 316 */ 317U_CFUNC int32_t 318ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit, 319 uint32_t n, 320 uint32_t maxByte, 321 WeightRange ranges[7]) { 322 /* number of usable byte values 3..maxByte */ 323 uint32_t countBytes=maxByte-UCOL_BYTE_FIRST_TAILORED+1; 324 325 uint32_t lengthCounts[6]; /* [0] unused, [5] to make index checks unnecessary */ 326 uint32_t maxCount; 327 int32_t i, rangeCount, minLength/*, maxLength*/; 328 329 /* countBytes to the power of index */ 330 uint32_t powers[5]; 331 /* gcc requires explicit initialization */ 332 powers[0] = 1; 333 powers[1] = countBytes; 334 powers[2] = countBytes*countBytes; 335 powers[3] = countBytes*countBytes*countBytes; 336 powers[4] = countBytes*countBytes*countBytes*countBytes; 337 338#ifdef UCOL_DEBUG 339 puts(""); 340#endif 341 342 rangeCount=getWeightRanges(lowerLimit, upperLimit, maxByte, countBytes, ranges); 343 if(rangeCount<=0) { 344#ifdef UCOL_DEBUG 345 printf("error: unable to get Weight ranges\n"); 346#endif 347 return 0; 348 } 349 350 /* what is the maximum number of weights with these ranges? */ 351 maxCount=0; 352 for(i=0; i<rangeCount; ++i) { 353 maxCount+=(uint32_t)ranges[i].count*powers[4-ranges[i].length]; 354 } 355 if(maxCount>=n) { 356#ifdef UCOL_DEBUG 357 printf("the maximum number of %lu weights is sufficient for n=%lu\n", maxCount, n); 358#endif 359 } else { 360#ifdef UCOL_DEBUG 361 printf("error: the maximum number of %lu weights is insufficient for n=%lu\n", maxCount, n); 362#endif 363 return 0; 364 } 365 366 /* set the length2 and count2 fields */ 367 for(i=0; i<rangeCount; ++i) { 368 ranges[i].length2=ranges[i].length; 369 ranges[i].count2=(uint32_t)ranges[i].count; 370 } 371 372 /* try until we find suitably large ranges */ 373 for(;;) { 374 /* get the smallest number of bytes in a range */ 375 minLength=ranges[0].length2; 376 377 /* sum up the number of elements that fit into ranges of each byte length */ 378 uprv_memset(lengthCounts, 0, sizeof(lengthCounts)); 379 for(i=0; i<rangeCount; ++i) { 380 lengthCounts[ranges[i].length2]+=ranges[i].count2; 381 } 382 383 /* now try to allocate n elements in the available short ranges */ 384 if(n<=(lengthCounts[minLength]+lengthCounts[minLength+1])) { 385 /* trivial cases, use the first few ranges */ 386 maxCount=0; 387 rangeCount=0; 388 do { 389 maxCount+=ranges[rangeCount].count2; 390 ++rangeCount; 391 } while(n>maxCount); 392#ifdef UCOL_DEBUG 393 printf("take first %ld ranges\n", rangeCount); 394#endif 395 break; 396 } else if(n<=ranges[0].count2*countBytes) { 397 /* easy case, just make this one range large enough by lengthening it once more, possibly split it */ 398 uint32_t count1, count2, power_1, power; 399 400 /*maxLength=minLength+1;*/ 401 402 /* calculate how to split the range between maxLength-1 (count1) and maxLength (count2) */ 403 power_1=powers[minLength-ranges[0].length]; 404 power=power_1*countBytes; 405 count2=(n+power-1)/power; 406 count1=ranges[0].count-count2; 407 408 /* split the range */ 409#ifdef UCOL_DEBUG 410 printf("split the first range %ld:%ld\n", count1, count2); 411#endif 412 if(count1<1) { 413 rangeCount=1; 414 415 /* lengthen the entire range to maxLength */ 416 lengthenRange(ranges, maxByte, countBytes); 417 } else { 418 /* really split the range */ 419 uint32_t byte; 420 421 /* create a new range with the end and initial and current length of the old one */ 422 rangeCount=2; 423 ranges[1].end=ranges[0].end; 424 ranges[1].length=ranges[0].length; 425 ranges[1].length2=minLength; 426 427 /* set the end of the first range according to count1 */ 428 i=ranges[0].length; 429 byte=getWeightByte(ranges[0].start, i)+count1-1; 430 431 /* 432 * ranges[0].count and count1 may be >countBytes 433 * from merging adjacent ranges; 434 * byte>maxByte is possible 435 */ 436 if(byte<=maxByte) { 437 ranges[0].end=setWeightByte(ranges[0].start, i, byte); 438 } else /* byte>maxByte */ { 439 ranges[0].end=setWeightByte(incWeight(ranges[0].start, i-1, maxByte), i, byte-countBytes); 440 } 441 442 /* set the bytes in the end weight at length+1..length2 to maxByte */ 443 byte=(maxByte<<24)|(maxByte<<16)|(maxByte<<8)|maxByte; /* this used to be 0xffffffff */ 444 ranges[0].end=truncateWeight(ranges[0].end, i)| 445 ((byte>>(8*i))&(byte<<(8*(4-minLength)))); 446 447 /* set the start of the second range to immediately follow the end of the first one */ 448 ranges[1].start=incWeight(ranges[0].end, minLength, maxByte); 449 450 /* set the count values (informational) */ 451 ranges[0].count=count1; 452 ranges[1].count=count2; 453 454 ranges[0].count2=count1*power_1; 455 ranges[1].count2=count2*power_1; /* will be *countBytes when lengthened */ 456 457 /* lengthen the second range to maxLength */ 458 lengthenRange(ranges+1, maxByte, countBytes); 459 } 460 break; 461 } 462 463 /* no good match, lengthen all minLength ranges and iterate */ 464#ifdef UCOL_DEBUG 465 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1); 466#endif 467 for(i=0; ranges[i].length2==minLength; ++i) { 468 lengthenRange(ranges+i, maxByte, countBytes); 469 } 470 } 471 472 if(rangeCount>1) { 473 /* sort the ranges by weight values */ 474 UErrorCode errorCode=U_ZERO_ERROR; 475 uprv_sortArray(ranges, rangeCount, sizeof(WeightRange), compareRanges, NULL, FALSE, &errorCode); 476 /* ignore error code: we know that the internal sort function will not fail here */ 477 } 478 479#ifdef UCOL_DEBUG 480 puts("final ranges:"); 481 for(i=0; i<rangeCount; ++i) { 482 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .length2=%ld .count=%ld .count2=%lu\n", 483 i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].length2, ranges[i].count, ranges[i].count2); 484 } 485#endif 486 487 /* set maxByte in ranges[0] for ucol_nextWeight() */ 488 ranges[0].count=maxByte; 489 490 return rangeCount; 491} 492 493/* 494 * given a set of ranges calculated by ucol_allocWeights(), 495 * iterate through the weights 496 */ 497U_CFUNC uint32_t 498ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) { 499 if(*pRangeCount<=0) { 500 return 0xffffffff; 501 } else { 502 uint32_t weight, maxByte; 503 504 /* get maxByte from the .count field */ 505 maxByte=ranges[0].count; 506 507 /* get the next weight */ 508 weight=ranges[0].start; 509 if(weight==ranges[0].end) { 510 /* this range is finished, remove it and move the following ones up */ 511 if(--*pRangeCount>0) { 512 uprv_memmove(ranges, ranges+1, *pRangeCount*sizeof(WeightRange)); 513 ranges[0].count=maxByte; /* keep maxByte in ranges[0] */ 514 } 515 } else { 516 /* increment the weight for the next value */ 517 ranges[0].start=incWeight(weight, ranges[0].length2, maxByte); 518 } 519 520 return weight; 521 } 522} 523 524#if 0 // #ifdef UCOL_DEBUG 525 526static void 527testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) { 528 WeightRange ranges[8]; 529 int32_t rangeCount; 530 531 rangeCount=ucol_allocWeights(lowerLimit, upperLimit, n, ranges); 532 if(enumerate) { 533 uint32_t weight; 534 535 while(n>0) { 536 weight=ucol_nextWeight(ranges, &rangeCount); 537 if(weight==0xffffffff) { 538 printf("error: 0xffffffff with %lu more weights to go\n", n); 539 break; 540 } 541 printf(" 0x%08lx\n", weight); 542 --n; 543 } 544 } 545} 546 547extern int 548main(int argc, const char *argv[]) { 549#if 0 550#endif 551 testAlloc(0x364214fc, 0x44b87d23, 5, FALSE); 552 testAlloc(0x36421500, 0x44b87d23, 5, FALSE); 553 testAlloc(0x36421500, 0x44b87d23, 20, FALSE); 554 testAlloc(0x36421500, 0x44b87d23, 13700, FALSE); 555 testAlloc(0x36421500, 0x38b87d23, 1, FALSE); 556 testAlloc(0x36421500, 0x38b87d23, 20, FALSE); 557 testAlloc(0x36421500, 0x38b87d23, 200, TRUE); 558 testAlloc(0x36421500, 0x38b87d23, 13700, FALSE); 559 testAlloc(0x36421500, 0x37b87d23, 13700, FALSE); 560 testAlloc(0x36ef1500, 0x37b87d23, 13700, FALSE); 561 testAlloc(0x36421500, 0x36b87d23, 13700, FALSE); 562 testAlloc(0x36b87122, 0x36b87d23, 13700, FALSE); 563 testAlloc(0x49000000, 0x4a600000, 13700, FALSE); 564 testAlloc(0x9fffffff, 0xd0000000, 13700, FALSE); 565 testAlloc(0x9fffffff, 0xd0000000, 67400, FALSE); 566 testAlloc(0x9fffffff, 0xa0030000, 67400, FALSE); 567 testAlloc(0x9fffffff, 0xa0030000, 40000, FALSE); 568 testAlloc(0xa0000000, 0xa0030000, 40000, FALSE); 569 testAlloc(0xa0031100, 0xa0030000, 40000, FALSE); 570#if 0 571#endif 572 return 0; 573} 574 575#endif 576 577#endif /* #if !UCONFIG_NO_COLLATION */ 578