1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4******************************************************************************* 5* Copyright (C) 2013-2014, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* CollationRootElements.java, ported from collationrootelements.h/.cpp 9* 10* C++ version created on: 2013mar01 11* created by: Markus W. Scherer 12*/ 13 14package com.ibm.icu.impl.coll; 15 16/** 17 * Container and access methods for collation elements and weights 18 * that occur in the root collator. 19 * Needed for finding boundaries for building a tailoring. 20 * 21 * This class takes and returns 16-bit secondary and tertiary weights. 22 */ 23public final class CollationRootElements { 24 public CollationRootElements(long[] rootElements) { 25 elements = rootElements; 26 } 27 28 /** 29 * Higher than any root primary. 30 */ 31 public static final long PRIMARY_SENTINEL = 0xffffff00L; 32 33 /** 34 * Flag in a root element, set if the element contains secondary & tertiary weights, 35 * rather than a primary. 36 */ 37 public static final int SEC_TER_DELTA_FLAG = 0x80; 38 /** 39 * Mask for getting the primary range step value from a primary-range-end element. 40 */ 41 public static final int PRIMARY_STEP_MASK = 0x7f; 42 43 /** 44 * Index of the first CE with a non-zero tertiary weight. 45 * Same as the start of the compact root elements table. 46 */ 47 public static final int IX_FIRST_TERTIARY_INDEX = 0; 48 /** 49 * Index of the first CE with a non-zero secondary weight. 50 */ 51 static final int IX_FIRST_SECONDARY_INDEX = 1; 52 /** 53 * Index of the first CE with a non-zero primary weight. 54 */ 55 static final int IX_FIRST_PRIMARY_INDEX = 2; 56 /** 57 * Must match Collation.COMMON_SEC_AND_TER_CE. 58 */ 59 static final int IX_COMMON_SEC_AND_TER_CE = 3; 60 /** 61 * Secondary & tertiary boundaries. 62 * Bits 31..24: [fixed last secondary common byte 45] 63 * Bits 23..16: [fixed first ignorable secondary byte 80] 64 * Bits 15.. 8: reserved, 0 65 * Bits 7.. 0: [fixed first ignorable tertiary byte 3C] 66 */ 67 static final int IX_SEC_TER_BOUNDARIES = 4; 68 /** 69 * The current number of indexes. 70 * Currently the same as elements[IX_FIRST_TERTIARY_INDEX]. 71 */ 72 static final int IX_COUNT = 5; 73 74 /** 75 * Returns the boundary between tertiary weights of primary/secondary CEs 76 * and those of tertiary CEs. 77 * This is the upper limit for tertiaries of primary/secondary CEs. 78 * This minus one is the lower limit for tertiaries of tertiary CEs. 79 */ 80 public int getTertiaryBoundary() { 81 return ((int)elements[IX_SEC_TER_BOUNDARIES] << 8) & 0xff00; 82 } 83 84 /** 85 * Returns the first assigned tertiary CE. 86 */ 87 long getFirstTertiaryCE() { 88 return elements[(int)elements[IX_FIRST_TERTIARY_INDEX]] & ~SEC_TER_DELTA_FLAG; 89 } 90 91 /** 92 * Returns the last assigned tertiary CE. 93 */ 94 long getLastTertiaryCE() { 95 return elements[(int)elements[IX_FIRST_SECONDARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG; 96 } 97 98 /** 99 * Returns the last common secondary weight. 100 * This is the lower limit for secondaries of primary CEs. 101 */ 102 public int getLastCommonSecondary() { 103 return ((int)elements[IX_SEC_TER_BOUNDARIES] >> 16) & 0xff00; 104 } 105 106 /** 107 * Returns the boundary between secondary weights of primary CEs 108 * and those of secondary CEs. 109 * This is the upper limit for secondaries of primary CEs. 110 * This minus one is the lower limit for secondaries of secondary CEs. 111 */ 112 public int getSecondaryBoundary() { 113 return ((int)elements[IX_SEC_TER_BOUNDARIES] >> 8) & 0xff00; 114 } 115 116 /** 117 * Returns the first assigned secondary CE. 118 */ 119 long getFirstSecondaryCE() { 120 return elements[(int)elements[IX_FIRST_SECONDARY_INDEX]] & ~SEC_TER_DELTA_FLAG; 121 } 122 123 /** 124 * Returns the last assigned secondary CE. 125 */ 126 long getLastSecondaryCE() { 127 return elements[(int)elements[IX_FIRST_PRIMARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG; 128 } 129 130 /** 131 * Returns the first assigned primary weight. 132 */ 133 long getFirstPrimary() { 134 return elements[(int)elements[IX_FIRST_PRIMARY_INDEX]]; // step=0: cannot be a range end 135 } 136 137 /** 138 * Returns the first assigned primary CE. 139 */ 140 long getFirstPrimaryCE() { 141 return Collation.makeCE(getFirstPrimary()); 142 } 143 144 /** 145 * Returns the last root CE with a primary weight before p. 146 * Intended only for reordering group boundaries. 147 */ 148 long lastCEWithPrimaryBefore(long p) { 149 if(p == 0) { return 0; } 150 assert(p > elements[(int)elements[IX_FIRST_PRIMARY_INDEX]]); 151 int index = findP(p); 152 long q = elements[index]; 153 long secTer; 154 if(p == (q & 0xffffff00L)) { 155 // p == elements[index] is a root primary. Find the CE before it. 156 // We must not be in a primary range. 157 assert((q & PRIMARY_STEP_MASK) == 0); 158 secTer = elements[index - 1]; 159 if((secTer & SEC_TER_DELTA_FLAG) == 0) { 160 // Primary CE just before p. 161 p = secTer & 0xffffff00L; 162 secTer = Collation.COMMON_SEC_AND_TER_CE; 163 } else { 164 // secTer = last secondary & tertiary for the previous primary 165 index -= 2; 166 for(;;) { 167 p = elements[index]; 168 if((p & SEC_TER_DELTA_FLAG) == 0) { 169 p &= 0xffffff00L; 170 break; 171 } 172 --index; 173 } 174 } 175 } else { 176 // p > elements[index] which is the previous primary. 177 // Find the last secondary & tertiary weights for it. 178 p = q & 0xffffff00L; 179 secTer = Collation.COMMON_SEC_AND_TER_CE; 180 for(;;) { 181 q = elements[++index]; 182 if((q & SEC_TER_DELTA_FLAG) == 0) { 183 // We must not be in a primary range. 184 assert((q & PRIMARY_STEP_MASK) == 0); 185 break; 186 } 187 secTer = q; 188 } 189 } 190 return (p << 32) | (secTer & ~SEC_TER_DELTA_FLAG); 191 } 192 193 /** 194 * Returns the first root CE with a primary weight of at least p. 195 * Intended only for reordering group boundaries. 196 */ 197 long firstCEWithPrimaryAtLeast(long p) { 198 if(p == 0) { return 0; } 199 int index = findP(p); 200 if(p != (elements[index] & 0xffffff00L)) { 201 for(;;) { 202 p = elements[++index]; 203 if((p & SEC_TER_DELTA_FLAG) == 0) { 204 // First primary after p. We must not be in a primary range. 205 assert((p & PRIMARY_STEP_MASK) == 0); 206 break; 207 } 208 } 209 } 210 // The code above guarantees that p has at most 3 bytes: (p & 0xff) == 0. 211 return (p << 32) | Collation.COMMON_SEC_AND_TER_CE; 212 } 213 214 /** 215 * Returns the primary weight before p. 216 * p must be greater than the first root primary. 217 */ 218 long getPrimaryBefore(long p, boolean isCompressible) { 219 int index = findPrimary(p); 220 int step; 221 long q = elements[index]; 222 if(p == (q & 0xffffff00L)) { 223 // Found p itself. Return the previous primary. 224 // See if p is at the end of a previous range. 225 step = (int)q & PRIMARY_STEP_MASK; 226 if(step == 0) { 227 // p is not at the end of a range. Look for the previous primary. 228 do { 229 p = elements[--index]; 230 } while((p & SEC_TER_DELTA_FLAG) != 0); 231 return p & 0xffffff00L; 232 } 233 } else { 234 // p is in a range, and not at the start. 235 long nextElement = elements[index + 1]; 236 assert(isEndOfPrimaryRange(nextElement)); 237 step = (int)nextElement & PRIMARY_STEP_MASK; 238 } 239 // Return the previous range primary. 240 if((p & 0xffff) == 0) { 241 return Collation.decTwoBytePrimaryByOneStep(p, isCompressible, step); 242 } else { 243 return Collation.decThreeBytePrimaryByOneStep(p, isCompressible, step); 244 } 245 } 246 247 /** Returns the secondary weight before [p, s]. */ 248 int getSecondaryBefore(long p, int s) { 249 int index; 250 int previousSec, sec; 251 if(p == 0) { 252 index = (int)elements[IX_FIRST_SECONDARY_INDEX]; 253 // Gap at the beginning of the secondary CE range. 254 previousSec = 0; 255 sec = (int)(elements[index] >> 16); 256 } else { 257 index = findPrimary(p) + 1; 258 previousSec = Collation.BEFORE_WEIGHT16; 259 sec = (int)getFirstSecTerForPrimary(index) >>> 16; 260 } 261 assert(s >= sec); 262 while(s > sec) { 263 previousSec = sec; 264 assert((elements[index] & SEC_TER_DELTA_FLAG) != 0); 265 sec = (int)(elements[index++] >> 16); 266 } 267 assert(sec == s); 268 return previousSec; 269 } 270 271 /** Returns the tertiary weight before [p, s, t]. */ 272 int getTertiaryBefore(long p, int s, int t) { 273 assert((t & ~Collation.ONLY_TERTIARY_MASK) == 0); 274 int index; 275 int previousTer; 276 long secTer; 277 if(p == 0) { 278 if(s == 0) { 279 index = (int)elements[IX_FIRST_TERTIARY_INDEX]; 280 // Gap at the beginning of the tertiary CE range. 281 previousTer = 0; 282 } else { 283 index = (int)elements[IX_FIRST_SECONDARY_INDEX]; 284 previousTer = Collation.BEFORE_WEIGHT16; 285 } 286 secTer = elements[index] & ~SEC_TER_DELTA_FLAG; 287 } else { 288 index = findPrimary(p) + 1; 289 previousTer = Collation.BEFORE_WEIGHT16; 290 secTer = getFirstSecTerForPrimary(index); 291 } 292 long st = ((long)s << 16) | t; 293 while(st > secTer) { 294 if((int)(secTer >> 16) == s) { previousTer = (int)secTer; } 295 assert((elements[index] & SEC_TER_DELTA_FLAG) != 0); 296 secTer = elements[index++] & ~SEC_TER_DELTA_FLAG; 297 } 298 assert(secTer == st); 299 return previousTer & 0xffff; 300 } 301 302 /** 303 * Finds the index of the input primary. 304 * p must occur as a root primary, and must not be 0. 305 */ 306 int findPrimary(long p) { 307 // Requirement: p must occur as a root primary. 308 assert((p & 0xff) == 0); // at most a 3-byte primary 309 int index = findP(p); 310 // If p is in a range, then we just assume that p is an actual primary in this range. 311 // (Too cumbersome/expensive to check.) 312 // Otherwise, it must be an exact match. 313 assert(isEndOfPrimaryRange(elements[index + 1]) || p == (elements[index] & 0xffffff00L)); 314 return index; 315 } 316 317 /** 318 * Returns the primary weight after p where index=findPrimary(p). 319 * p must be at least the first root primary. 320 */ 321 long getPrimaryAfter(long p, int index, boolean isCompressible) { 322 assert(p == (elements[index] & 0xffffff00L) || isEndOfPrimaryRange(elements[index + 1])); 323 long q = elements[++index]; 324 int step; 325 if((q & SEC_TER_DELTA_FLAG) == 0 && (step = (int)q & PRIMARY_STEP_MASK) != 0) { 326 // Return the next primary in this range. 327 if((p & 0xffff) == 0) { 328 return Collation.incTwoBytePrimaryByOffset(p, isCompressible, step); 329 } else { 330 return Collation.incThreeBytePrimaryByOffset(p, isCompressible, step); 331 } 332 } else { 333 // Return the next primary in the list. 334 while((q & SEC_TER_DELTA_FLAG) != 0) { 335 q = elements[++index]; 336 } 337 assert((q & PRIMARY_STEP_MASK) == 0); 338 return q; 339 } 340 } 341 /** 342 * Returns the secondary weight after [p, s] where index=findPrimary(p) 343 * except use index=0 for p=0. 344 * 345 * <p>Must return a weight for every root [p, s] as well as for every weight 346 * returned by getSecondaryBefore(). If p!=0 then s can be BEFORE_WEIGHT16. 347 * 348 * <p>Exception: [0, 0] is handled by the CollationBuilder: 349 * Both its lower and upper boundaries are special. 350 */ 351 int getSecondaryAfter(int index, int s) { 352 long secTer; 353 int secLimit; 354 if(index == 0) { 355 // primary = 0 356 assert(s != 0); 357 index = (int)elements[IX_FIRST_SECONDARY_INDEX]; 358 secTer = elements[index]; 359 // Gap at the end of the secondary CE range. 360 secLimit = 0x10000; 361 } else { 362 assert(index >= (int)elements[IX_FIRST_PRIMARY_INDEX]); 363 secTer = getFirstSecTerForPrimary(index + 1); 364 // If this is an explicit sec/ter unit, then it will be read once more. 365 // Gap for secondaries of primary CEs. 366 secLimit = getSecondaryBoundary(); 367 } 368 for(;;) { 369 int sec = (int)(secTer >> 16); 370 if(sec > s) { return sec; } 371 secTer = elements[++index]; 372 if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; } 373 } 374 } 375 /** 376 * Returns the tertiary weight after [p, s, t] where index=findPrimary(p) 377 * except use index=0 for p=0. 378 * 379 * <p>Must return a weight for every root [p, s, t] as well as for every weight 380 * returned by getTertiaryBefore(). If s!=0 then t can be BEFORE_WEIGHT16. 381 * 382 * <p>Exception: [0, 0, 0] is handled by the CollationBuilder: 383 * Both its lower and upper boundaries are special. 384 */ 385 int getTertiaryAfter(int index, int s, int t) { 386 long secTer; 387 int terLimit; 388 if(index == 0) { 389 // primary = 0 390 if(s == 0) { 391 assert(t != 0); 392 index = (int)elements[IX_FIRST_TERTIARY_INDEX]; 393 // Gap at the end of the tertiary CE range. 394 terLimit = 0x4000; 395 } else { 396 index = (int)elements[IX_FIRST_SECONDARY_INDEX]; 397 // Gap for tertiaries of primary/secondary CEs. 398 terLimit = getTertiaryBoundary(); 399 } 400 secTer = elements[index] & ~SEC_TER_DELTA_FLAG; 401 } else { 402 assert(index >= (int)elements[IX_FIRST_PRIMARY_INDEX]); 403 secTer = getFirstSecTerForPrimary(index + 1); 404 // If this is an explicit sec/ter unit, then it will be read once more. 405 terLimit = getTertiaryBoundary(); 406 } 407 long st = (((long)s & 0xffffffffL) << 16) | t; 408 for(;;) { 409 if(secTer > st) { 410 assert((secTer >> 16) == s); 411 return (int)secTer & 0xffff; 412 } 413 secTer = elements[++index]; 414 // No tertiary greater than t for this primary+secondary. 415 if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; } 416 secTer &= ~SEC_TER_DELTA_FLAG; 417 } 418 } 419 420 /** 421 * Returns the first secondary & tertiary weights for p where index=findPrimary(p)+1. 422 */ 423 private long getFirstSecTerForPrimary(int index) { 424 long secTer = elements[index]; 425 if((secTer & SEC_TER_DELTA_FLAG) == 0) { 426 // No sec/ter delta. 427 return Collation.COMMON_SEC_AND_TER_CE; 428 } 429 secTer &= ~SEC_TER_DELTA_FLAG; 430 if(secTer > Collation.COMMON_SEC_AND_TER_CE) { 431 // Implied sec/ter. 432 return Collation.COMMON_SEC_AND_TER_CE; 433 } 434 // Explicit sec/ter below common/common. 435 return secTer; 436 } 437 438 /** 439 * Finds the largest index i where elements[i]<=p. 440 * Requires first primary<=p<0xffffff00 (PRIMARY_SENTINEL). 441 * Does not require that p is a root collator primary. 442 */ 443 private int findP(long p) { 444 // p need not occur as a root primary. 445 // For example, it might be a reordering group boundary. 446 assert((p >> 24) != Collation.UNASSIGNED_IMPLICIT_BYTE); 447 // modified binary search 448 int start = (int)elements[IX_FIRST_PRIMARY_INDEX]; 449 assert(p >= elements[start]); 450 int limit = elements.length - 1; 451 assert(elements[limit] >= PRIMARY_SENTINEL); 452 assert(p < elements[limit]); 453 while((start + 1) < limit) { 454 // Invariant: elements[start] and elements[limit] are primaries, 455 // and elements[start]<=p<=elements[limit]. 456 int i = (int)(((long)start + (long)limit) / 2); 457 long q = elements[i]; 458 if((q & SEC_TER_DELTA_FLAG) != 0) { 459 // Find the next primary. 460 int j = i + 1; 461 for(;;) { 462 if(j == limit) { break; } 463 q = elements[j]; 464 if((q & SEC_TER_DELTA_FLAG) == 0) { 465 i = j; 466 break; 467 } 468 ++j; 469 } 470 if((q & SEC_TER_DELTA_FLAG) != 0) { 471 // Find the preceding primary. 472 j = i - 1; 473 for(;;) { 474 if(j == start) { break; } 475 q = elements[j]; 476 if((q & SEC_TER_DELTA_FLAG) == 0) { 477 i = j; 478 break; 479 } 480 --j; 481 } 482 if((q & SEC_TER_DELTA_FLAG) != 0) { 483 // No primary between start and limit. 484 break; 485 } 486 } 487 } 488 if(p < (q & 0xffffff00L)) { // Reset the "step" bits of a range end primary. 489 limit = i; 490 } else { 491 start = i; 492 } 493 } 494 return start; 495 } 496 497 private static boolean isEndOfPrimaryRange(long q) { 498 return (q & SEC_TER_DELTA_FLAG) == 0 && (q & PRIMARY_STEP_MASK) != 0; 499 } 500 501 /** 502 * Data structure: See ICU4C source/i18n/collationrootelements.h. 503 */ 504 private long[] elements; 505} 506