CollationWeights.java revision bd1cbb618dcaa1ac6ba7c77dece35cb79593a5d7
1/* 2******************************************************************************* 3* 4* Copyright (C) 1999-2015, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* CollationWeights.java, ported from collationweights.h/.cpp 9* 10* C++ version created on: 2001mar08 as ucol_wgt.h 11* created by: Markus W. Scherer 12*/ 13 14package com.ibm.icu.impl.coll; 15 16import java.util.Arrays; 17 18/** 19 * Allocates n collation element weights between two exclusive limits. 20 * Used only internally by the collation tailoring builder. 21 */ 22public final class CollationWeights { 23 public CollationWeights() {} 24 25 public void initForPrimary(boolean compressible) { 26 middleLength=1; 27 minBytes[1] = Collation.MERGE_SEPARATOR_BYTE + 1; 28 maxBytes[1] = Collation.TRAIL_WEIGHT_BYTE; 29 if(compressible) { 30 minBytes[2] = Collation.PRIMARY_COMPRESSION_LOW_BYTE + 1; 31 maxBytes[2] = Collation.PRIMARY_COMPRESSION_HIGH_BYTE - 1; 32 } else { 33 minBytes[2] = 2; 34 maxBytes[2] = 0xff; 35 } 36 minBytes[3] = 2; 37 maxBytes[3] = 0xff; 38 minBytes[4] = 2; 39 maxBytes[4] = 0xff; 40 } 41 42 public void initForSecondary() { 43 // We use only the lower 16 bits for secondary weights. 44 middleLength=3; 45 minBytes[1] = 0; 46 maxBytes[1] = 0; 47 minBytes[2] = 0; 48 maxBytes[2] = 0; 49 minBytes[3] = Collation.LEVEL_SEPARATOR_BYTE + 1; 50 maxBytes[3] = 0xff; 51 minBytes[4] = 2; 52 maxBytes[4] = 0xff; 53 } 54 55 public void initForTertiary() { 56 // We use only the lower 16 bits for tertiary weights. 57 middleLength=3; 58 minBytes[1] = 0; 59 maxBytes[1] = 0; 60 minBytes[2] = 0; 61 maxBytes[2] = 0; 62 // We use only 6 bits per byte. 63 // The other bits are used for case & quaternary weights. 64 minBytes[3] = Collation.LEVEL_SEPARATOR_BYTE + 1; 65 maxBytes[3] = 0x3f; 66 minBytes[4] = 2; 67 maxBytes[4] = 0x3f; 68 } 69 70 /** 71 * Determine heuristically 72 * what ranges to use for a given number of weights between (excluding) 73 * two limits. 74 * 75 * @param lowerLimit A collation element weight; the ranges will be filled to cover 76 * weights greater than this one. 77 * @param upperLimit A collation element weight; the ranges will be filled to cover 78 * weights less than this one. 79 * @param n The number of collation element weights w necessary such that 80 * lowerLimit<w<upperLimit in lexical order. 81 * @return true if it is possible to fit n elements between the limits 82 */ 83 public boolean allocWeights(long lowerLimit, long upperLimit, int n) { 84 // Call getWeightRanges() and then determine heuristically 85 // which ranges to use for a given number of weights between (excluding) 86 // two limits. 87 // puts(""); 88 89 if(!getWeightRanges(lowerLimit, upperLimit)) { 90 // printf("error: unable to get Weight ranges\n"); 91 return false; 92 } 93 94 /* try until we find suitably large ranges */ 95 for(;;) { 96 /* get the smallest number of bytes in a range */ 97 int minLength=ranges[0].length; 98 99 if(allocWeightsInShortRanges(n, minLength)) { break; } 100 101 if(minLength == 4) { 102 // printf("error: the maximum number of %ld weights is insufficient for n=%ld\n", 103 // minLengthCount, n); 104 return false; 105 } 106 107 if(allocWeightsInMinLengthRanges(n, minLength)) { break; } 108 109 /* no good match, lengthen all minLength ranges and iterate */ 110 // printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1); 111 for(int i=0; ranges[i].length==minLength; ++i) { 112 lengthenRange(ranges[i]); 113 } 114 } 115 116 /* puts("final ranges:"); 117 for(int i=0; i<rangeCount; ++i) { 118 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n", 119 i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count); 120 } */ 121 122 rangeIndex = 0; 123 if(rangeCount < ranges.length) { 124 ranges[rangeCount] = null; // force a crash when going out of bounds 125 } 126 return true; 127 } 128 129 /** 130 * Given a set of ranges calculated by allocWeights(), 131 * iterate through the weights. 132 * The ranges are modified to keep the current iteration state. 133 * 134 * @return The next weight in the ranges, or 0xffffffff if there is none left. 135 */ 136 public long nextWeight() { 137 if(rangeIndex >= rangeCount) { 138 return 0xffffffffL; 139 } else { 140 /* get the next weight */ 141 WeightRange range = ranges[rangeIndex]; 142 long weight = range.start; 143 if(--range.count == 0) { 144 /* this range is finished */ 145 ++rangeIndex; 146 } else { 147 /* increment the weight for the next value */ 148 range.start = incWeight(weight, range.length); 149 assert(range.start <= range.end); 150 } 151 152 return weight; 153 } 154 } 155 156 /** @internal */ 157 private static final class WeightRange implements Comparable<WeightRange> { 158 long start, end; 159 int length, count; 160 161 // Java 6: @Override 162 public int compareTo(WeightRange other) { 163 long l=start; 164 long r=other.start; 165 if(l<r) { 166 return -1; 167 } else if(l>r) { 168 return 1; 169 } else { 170 return 0; 171 } 172 } 173 } 174 175 /* helper functions for CE weights */ 176 177 public static int lengthOfWeight(long weight) { 178 if((weight&0xffffff)==0) { 179 return 1; 180 } else if((weight&0xffff)==0) { 181 return 2; 182 } else if((weight&0xff)==0) { 183 return 3; 184 } else { 185 return 4; 186 } 187 } 188 189 private static int getWeightTrail(long weight, int length) { 190 return (int)(weight>>(8*(4-length)))&0xff; 191 } 192 193 private static long setWeightTrail(long weight, int length, int trail) { 194 length=8*(4-length); 195 return (weight&(0xffffff00L<<length))|((long)trail<<length); 196 } 197 198 private static int getWeightByte(long weight, int idx) { 199 return getWeightTrail(weight, idx); /* same calculation */ 200 } 201 202 private static long setWeightByte(long weight, int idx, int b) { 203 long mask; /* 0xffffffff except a 00 "hole" for the index-th byte */ 204 205 idx*=8; 206 if(idx<32) { 207 mask=0xffffffffL>>idx; 208 } else { 209 // Do not use int>>32 because on some platforms that does not shift at all 210 // while we need it to become 0. 211 // PowerPC: 0xffffffff>>32 = 0 (wanted) 212 // x86: 0xffffffff>>32 = 0xffffffff (not wanted) 213 // 214 // ANSI C99 6.5.7 Bitwise shift operators: 215 // "If the value of the right operand is negative 216 // or is greater than or equal to the width of the promoted left operand, 217 // the behavior is undefined." 218 mask=0; 219 } 220 idx=32-idx; 221 mask|=0xffffff00L<<idx; 222 return (weight&mask)|((long)b<<idx); 223 } 224 225 private static long truncateWeight(long weight, int length) { 226 return weight&(0xffffffffL<<(8*(4-length))); 227 } 228 229 private static long incWeightTrail(long weight, int length) { 230 return weight+(1L<<(8*(4-length))); 231 } 232 233 private static long decWeightTrail(long weight, int length) { 234 return weight-(1L<<(8*(4-length))); 235 } 236 237 /** @return number of usable byte values for byte idx */ 238 private int countBytes(int idx) { 239 return maxBytes[idx] - minBytes[idx] + 1; 240 } 241 242 private long incWeight(long weight, int length) { 243 for(;;) { 244 int b=getWeightByte(weight, length); 245 if(b<maxBytes[length]) { 246 return setWeightByte(weight, length, b+1); 247 } else { 248 // Roll over, set this byte to the minimum and increment the previous one. 249 weight=setWeightByte(weight, length, minBytes[length]); 250 --length; 251 assert(length > 0); 252 } 253 } 254 } 255 256 private long incWeightByOffset(long weight, int length, int offset) { 257 for(;;) { 258 offset += getWeightByte(weight, length); 259 if(offset <= maxBytes[length]) { 260 return setWeightByte(weight, length, offset); 261 } else { 262 // Split the offset between this byte and the previous one. 263 offset -= minBytes[length]; 264 weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length)); 265 offset /= countBytes(length); 266 --length; 267 assert(length > 0); 268 } 269 } 270 } 271 272 private void lengthenRange(WeightRange range) { 273 int length=range.length+1; 274 range.start=setWeightTrail(range.start, length, minBytes[length]); 275 range.end=setWeightTrail(range.end, length, maxBytes[length]); 276 range.count*=countBytes(length); 277 range.length=length; 278 } 279 280 /** 281 * Takes two CE weights and calculates the 282 * possible ranges of weights between the two limits, excluding them. 283 * For weights with up to 4 bytes there are up to 2*4-1=7 ranges. 284 */ 285 private boolean getWeightRanges(long lowerLimit, long upperLimit) { 286 assert(lowerLimit != 0); 287 assert(upperLimit != 0); 288 289 /* get the lengths of the limits */ 290 int lowerLength=lengthOfWeight(lowerLimit); 291 int upperLength=lengthOfWeight(upperLimit); 292 293 // printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength); 294 // printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength); 295 assert(lowerLength>=middleLength); 296 // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000. 297 298 if(lowerLimit>=upperLimit) { 299 // printf("error: no space between lower & upper limits\n"); 300 return false; 301 } 302 303 /* check that neither is a prefix of the other */ 304 if(lowerLength<upperLength) { 305 if(lowerLimit==truncateWeight(upperLimit, lowerLength)) { 306 // printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit); 307 return false; 308 } 309 } 310 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */ 311 312 WeightRange[] lower = new WeightRange[5]; /* [0] and [1] are not used - this simplifies indexing */ 313 WeightRange middle = new WeightRange(); 314 WeightRange[] upper = new WeightRange[5]; 315 316 /* 317 * With the limit lengths of 1..4, there are up to 7 ranges for allocation: 318 * range minimum length 319 * lower[4] 4 320 * lower[3] 3 321 * lower[2] 2 322 * middle 1 323 * upper[2] 2 324 * upper[3] 3 325 * upper[4] 4 326 * 327 * We are now going to calculate up to 7 ranges. 328 * Some of them will typically overlap, so we will then have to merge and eliminate ranges. 329 */ 330 long weight=lowerLimit; 331 for(int length=lowerLength; length>middleLength; --length) { 332 int trail=getWeightTrail(weight, length); 333 if(trail<maxBytes[length]) { 334 lower[length] = new WeightRange(); 335 lower[length].start=incWeightTrail(weight, length); 336 lower[length].end=setWeightTrail(weight, length, maxBytes[length]); 337 lower[length].length=length; 338 lower[length].count=maxBytes[length]-trail; 339 } 340 weight=truncateWeight(weight, length-1); 341 } 342 if(weight<0xff000000L) { 343 middle.start=incWeightTrail(weight, middleLength); 344 } else { 345 // Prevent overflow for primary lead byte FF 346 // which would yield a middle range starting at 0. 347 middle.start=0xffffffffL; // no middle range 348 } 349 350 weight=upperLimit; 351 for(int length=upperLength; length>middleLength; --length) { 352 int trail=getWeightTrail(weight, length); 353 if(trail>minBytes[length]) { 354 upper[length] = new WeightRange(); 355 upper[length].start=setWeightTrail(weight, length, minBytes[length]); 356 upper[length].end=decWeightTrail(weight, length); 357 upper[length].length=length; 358 upper[length].count=trail-minBytes[length]; 359 } 360 weight=truncateWeight(weight, length-1); 361 } 362 middle.end=decWeightTrail(weight, middleLength); 363 364 /* set the middle range */ 365 middle.length=middleLength; 366 if(middle.end>=middle.start) { 367 middle.count=(int)((middle.end-middle.start)>>(8*(4-middleLength)))+1; 368 } else { 369 /* no middle range, eliminate overlaps */ 370 for(int length=4; length>middleLength; --length) { 371 if(lower[length] != null && upper[length] != null && 372 lower[length].count>0 && upper[length].count>0) { 373 // Note: The lowerEnd and upperStart weights are versions of 374 // lowerLimit and upperLimit (which are lowerLimit<upperLimit), 375 // truncated (still less-or-equal) 376 // and then with their last bytes changed to the 377 // maxByte (for lowerEnd) or minByte (for upperStart). 378 final long lowerEnd=lower[length].end; 379 final long upperStart=upper[length].start; 380 boolean merged=false; 381 382 if(lowerEnd>upperStart) { 383 // These two lower and upper ranges collide. 384 // Since lowerLimit<upperLimit and lowerEnd and upperStart 385 // are versions with only their last bytes modified 386 // (and following ones removed/reset to 0), 387 // lowerEnd>upperStart is only possible 388 // if the leading bytes are equal 389 // and lastByte(lowerEnd)>lastByte(upperStart). 390 assert(truncateWeight(lowerEnd, length-1)== 391 truncateWeight(upperStart, length-1)); 392 // Intersect these two ranges. 393 lower[length].end=upper[length].end; 394 lower[length].count= 395 getWeightTrail(lower[length].end, length)- 396 getWeightTrail(lower[length].start, length)+1; 397 // count might be <=0 in which case there is no room, 398 // and the range-collecting code below will ignore this range. 399 merged=true; 400 } else if(lowerEnd==upperStart) { 401 // Not possible, unless minByte==maxByte which is not allowed. 402 assert(minBytes[length]<maxBytes[length]); 403 } else /* lowerEnd<upperStart */ { 404 if(incWeight(lowerEnd, length)==upperStart) { 405 // Merge adjacent ranges. 406 lower[length].end=upper[length].end; 407 lower[length].count+=upper[length].count; // might be >countBytes 408 merged=true; 409 } 410 } 411 if(merged) { 412 // Remove all shorter ranges. 413 // There was no room available for them between the ranges we just merged. 414 upper[length].count=0; 415 while(--length>middleLength) { 416 lower[length]=upper[length]=null; 417 } 418 break; 419 } 420 } 421 } 422 } 423 424 /* print ranges 425 for(int length=4; length>=2; --length) { 426 if(lower[length].count>0) { 427 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count); 428 } 429 } 430 if(middle.count>0) { 431 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count); 432 } 433 for(int length=2; length<=4; ++length) { 434 if(upper[length].count>0) { 435 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count); 436 } 437 } */ 438 439 /* copy the ranges, shortest first, into the result array */ 440 rangeCount=0; 441 if(middle.count>0) { 442 ranges[0] = middle; 443 rangeCount=1; 444 } 445 for(int length=middleLength+1; length<=4; ++length) { 446 /* copy upper first so that later the middle range is more likely the first one to use */ 447 if(upper[length] != null && upper[length].count>0) { 448 ranges[rangeCount++]=upper[length]; 449 } 450 if(lower[length] != null && lower[length].count>0) { 451 ranges[rangeCount++]=lower[length]; 452 } 453 } 454 return rangeCount>0; 455 } 456 457 private boolean allocWeightsInShortRanges(int n, int minLength) { 458 // See if the first few minLength and minLength+1 ranges have enough weights. 459 for(int i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) { 460 if(n <= ranges[i].count) { 461 // Use the first few minLength and minLength+1 ranges. 462 if(ranges[i].length > minLength) { 463 // Reduce the number of weights from the last minLength+1 range 464 // which might sort before some minLength ranges, 465 // so that we use all weights in the minLength ranges. 466 ranges[i].count = n; 467 } 468 rangeCount = i + 1; 469 // printf("take first %ld ranges\n", rangeCount); 470 471 if(rangeCount>1) { 472 /* sort the ranges by weight values */ 473 Arrays.sort(ranges, 0, rangeCount); 474 } 475 return true; 476 } 477 n -= ranges[i].count; // still >0 478 } 479 return false; 480 } 481 482 private boolean allocWeightsInMinLengthRanges(int n, int minLength) { 483 // See if the minLength ranges have enough weights 484 // when we split one and lengthen the following ones. 485 int count = 0; 486 int minLengthRangeCount; 487 for(minLengthRangeCount = 0; 488 minLengthRangeCount < rangeCount && 489 ranges[minLengthRangeCount].length == minLength; 490 ++minLengthRangeCount) { 491 count += ranges[minLengthRangeCount].count; 492 } 493 494 int nextCountBytes = countBytes(minLength + 1); 495 if(n > count * nextCountBytes) { return false; } 496 497 // Use the minLength ranges. Merge them, and then split again as necessary. 498 long start = ranges[0].start; 499 long end = ranges[0].end; 500 for(int i = 1; i < minLengthRangeCount; ++i) { 501 if(ranges[i].start < start) { start = ranges[i].start; } 502 if(ranges[i].end > end) { end = ranges[i].end; } 503 } 504 505 // Calculate how to split the range between minLength (count1) and minLength+1 (count2). 506 // Goal: 507 // count1 + count2 * nextCountBytes = n 508 // count1 + count2 = count 509 // These turn into 510 // (count - count2) + count2 * nextCountBytes = n 511 // and then into the following count1 & count2 computations. 512 int count2 = (n - count) / (nextCountBytes - 1); // number of weights to be lengthened 513 int count1 = count - count2; // number of minLength weights 514 if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) { 515 // round up 516 ++count2; 517 --count1; 518 assert((count1 + count2 * nextCountBytes) >= n); 519 } 520 521 ranges[0].start = start; 522 523 if(count1 == 0) { 524 // Make one long range. 525 ranges[0].end = end; 526 ranges[0].count = count; 527 lengthenRange(ranges[0]); 528 rangeCount = 1; 529 } else { 530 // Split the range, lengthen the second part. 531 // printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n", 532 // splitRange, rangeCount, count1, count2); 533 534 // Next start = start + count1. First end = 1 before that. 535 ranges[0].end = incWeightByOffset(start, minLength, count1 - 1); 536 ranges[0].count = count1; 537 538 if(ranges[1] == null) { 539 ranges[1] = new WeightRange(); 540 } 541 ranges[1].start = incWeight(ranges[0].end, minLength); 542 ranges[1].end = end; 543 ranges[1].length = minLength; // +1 when lengthened 544 ranges[1].count = count2; // *countBytes when lengthened 545 lengthenRange(ranges[1]); 546 rangeCount = 2; 547 } 548 return true; 549 } 550 551 private int middleLength; 552 private int[] minBytes = new int[5]; // for byte 1, 2, 3, 4 553 private int[] maxBytes = new int[5]; 554 private WeightRange[] ranges = new WeightRange[7]; 555 private int rangeIndex; 556 private int rangeCount; 557} 558