1/* GENERATED SOURCE. DO NOT MODIFY. */ 2/* 3******************************************************************************* 4* Copyright (C) 2013-2015, International Business Machines 5* Corporation and others. All Rights Reserved. 6******************************************************************************* 7* CollationBuilder.java, ported from collationbuilder.h/.cpp 8* 9* C++ version created on: 2013may06 10* created by: Markus W. Scherer 11*/ 12 13package android.icu.impl.coll; 14 15import java.text.ParseException; 16 17import android.icu.impl.Norm2AllModes; 18import android.icu.impl.Normalizer2Impl; 19import android.icu.impl.Normalizer2Impl.Hangul; 20import android.icu.lang.UScript; 21import android.icu.text.CanonicalIterator; 22import android.icu.text.Collator; 23import android.icu.text.Normalizer2; 24import android.icu.text.UnicodeSet; 25import android.icu.text.UnicodeSetIterator; 26import android.icu.util.ULocale; 27 28/** 29 * @hide Only a subset of ICU is exposed in Android 30 */ 31public final class CollationBuilder extends CollationRuleParser.Sink { 32 private static final boolean DEBUG = false; 33 private static final class BundleImporter implements CollationRuleParser.Importer { 34 BundleImporter() {} 35 public String getRules(String localeID, String collationType) { 36 return CollationLoader.loadRules(new ULocale(localeID), collationType); 37 } 38 } 39 40 public CollationBuilder(CollationTailoring b) { 41 nfd = Normalizer2.getNFDInstance(); 42 fcd = Norm2AllModes.getFCDNormalizer2(); 43 nfcImpl = Norm2AllModes.getNFCInstance().impl; 44 base = b; 45 baseData = b.data; 46 rootElements = new CollationRootElements(b.data.rootElements); 47 variableTop = 0; 48 dataBuilder = new CollationDataBuilder(); 49 fastLatinEnabled = true; 50 cesLength = 0; 51 rootPrimaryIndexes = new UVector32(); 52 nodes = new UVector64(); 53 nfcImpl.ensureCanonIterData(); 54 dataBuilder.initForTailoring(baseData); 55 } 56 57 public CollationTailoring parseAndBuild(String ruleString) throws ParseException { 58 if(baseData.rootElements == null) { 59 // C++ U_MISSING_RESOURCE_ERROR 60 throw new UnsupportedOperationException( 61 "missing root elements data, tailoring not supported"); 62 } 63 CollationTailoring tailoring = new CollationTailoring(base.settings); 64 CollationRuleParser parser = new CollationRuleParser(baseData); 65 // Note: This always bases &[last variable] and &[first regular] 66 // on the root collator's maxVariable/variableTop. 67 // If we wanted this to change after [maxVariable x], then we would keep 68 // the tailoring.settings pointer here and read its variableTop when we need it. 69 // See http://unicode.org/cldr/trac/ticket/6070 70 variableTop = base.settings.readOnly().variableTop; 71 parser.setSink(this); 72 // In Java, there is only one Importer implementation. 73 // In C++, the importer is a parameter for this method. 74 parser.setImporter(new BundleImporter()); 75 CollationSettings ownedSettings = tailoring.settings.copyOnWrite(); 76 parser.parse(ruleString, ownedSettings); 77 if(dataBuilder.hasMappings()) { 78 makeTailoredCEs(); 79 closeOverComposites(); 80 finalizeCEs(); 81 // Copy all of ASCII, and Latin-1 letters, into each tailoring. 82 optimizeSet.add(0, 0x7f); 83 optimizeSet.add(0xc0, 0xff); 84 // Hangul is decomposed on the fly during collation, 85 // and the tailoring data is always built with HANGUL_TAG specials. 86 optimizeSet.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END); 87 dataBuilder.optimize(optimizeSet); 88 tailoring.ensureOwnedData(); 89 if(fastLatinEnabled) { dataBuilder.enableFastLatin(); } 90 dataBuilder.build(tailoring.ownedData); 91 // C++ tailoring.builder = dataBuilder; 92 dataBuilder = null; 93 } else { 94 tailoring.data = baseData; 95 } 96 ownedSettings.fastLatinOptions = CollationFastLatin.getOptions( 97 tailoring.data, ownedSettings, 98 ownedSettings.fastLatinPrimaries); 99 tailoring.setRules(ruleString); 100 // In Java, we do not have a rules version. 101 // In C++, the genrb build tool reads and supplies one, 102 // and the rulesVersion is a parameter for this method. 103 tailoring.setVersion(base.version, 0 /* rulesVersion */); 104 return tailoring; 105 } 106 107 /** Implements CollationRuleParser.Sink. */ 108 @Override 109 void addReset(int strength, CharSequence str) { 110 assert(str.length() != 0); 111 if(str.charAt(0) == CollationRuleParser.POS_LEAD) { 112 ces[0] = getSpecialResetPosition(str); 113 cesLength = 1; 114 assert((ces[0] & Collation.CASE_AND_QUATERNARY_MASK) == 0); 115 } else { 116 // normal reset to a character or string 117 String nfdString = nfd.normalize(str); 118 cesLength = dataBuilder.getCEs(nfdString, ces, 0); 119 if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 120 throw new IllegalArgumentException( 121 "reset position maps to too many collation elements (more than 31)"); 122 } 123 } 124 if(strength == Collator.IDENTICAL) { return; } // simple reset-at-position 125 126 // &[before strength]position 127 assert(Collator.PRIMARY <= strength && strength <= Collator.TERTIARY); 128 int index = findOrInsertNodeForCEs(strength); 129 130 long node = nodes.elementAti(index); 131 // If the index is for a "weaker" node, 132 // then skip backwards over this and further "weaker" nodes. 133 while(strengthFromNode(node) > strength) { 134 index = previousIndexFromNode(node); 135 node = nodes.elementAti(index); 136 } 137 138 // Find or insert a node whose index we will put into a temporary CE. 139 if(strengthFromNode(node) == strength && isTailoredNode(node)) { 140 // Reset to just before this same-strength tailored node. 141 index = previousIndexFromNode(node); 142 } else if(strength == Collator.PRIMARY) { 143 // root primary node (has no previous index) 144 long p = weight32FromNode(node); 145 if(p == 0) { 146 throw new UnsupportedOperationException( 147 "reset primary-before ignorable not possible"); 148 } 149 if(p <= rootElements.getFirstPrimary()) { 150 // There is no primary gap between ignorables and the space-first-primary. 151 throw new UnsupportedOperationException( 152 "reset primary-before first non-ignorable not supported"); 153 } 154 if(p == Collation.FIRST_TRAILING_PRIMARY) { 155 // We do not support tailoring to an unassigned-implicit CE. 156 throw new UnsupportedOperationException( 157 "reset primary-before [first trailing] not supported"); 158 } 159 p = rootElements.getPrimaryBefore(p, baseData.isCompressiblePrimary(p)); 160 index = findOrInsertNodeForPrimary(p); 161 // Go to the last node in this list: 162 // Tailor after the last node between adjacent root nodes. 163 for(;;) { 164 node = nodes.elementAti(index); 165 int nextIndex = nextIndexFromNode(node); 166 if(nextIndex == 0) { break; } 167 index = nextIndex; 168 } 169 } else { 170 // &[before 2] or &[before 3] 171 index = findCommonNode(index, Collator.SECONDARY); 172 if(strength >= Collator.TERTIARY) { 173 index = findCommonNode(index, Collator.TERTIARY); 174 } 175 // findCommonNode() stayed on the stronger node or moved to 176 // an explicit common-weight node of the reset-before strength. 177 node = nodes.elementAti(index); 178 if(strengthFromNode(node) == strength) { 179 // Found a same-strength node with an explicit weight. 180 int weight16 = weight16FromNode(node); 181 if(weight16 == 0) { 182 throw new UnsupportedOperationException( 183 (strength == Collator.SECONDARY) ? 184 "reset secondary-before secondary ignorable not possible" : 185 "reset tertiary-before completely ignorable not possible"); 186 } 187 assert(weight16 > Collation.BEFORE_WEIGHT16); 188 // Reset to just before this node. 189 // Insert the preceding same-level explicit weight if it is not there already. 190 // Which explicit weight immediately precedes this one? 191 weight16 = getWeight16Before(index, node, strength); 192 // Does this preceding weight have a node? 193 int previousWeight16; 194 int previousIndex = previousIndexFromNode(node); 195 for(int i = previousIndex;; i = previousIndexFromNode(node)) { 196 node = nodes.elementAti(i); 197 int previousStrength = strengthFromNode(node); 198 if(previousStrength < strength) { 199 assert(weight16 >= Collation.COMMON_WEIGHT16 || i == previousIndex); 200 // Either the reset element has an above-common weight and 201 // the parent node provides the implied common weight, 202 // or the reset element has a weight<=common in the node 203 // right after the parent, and we need to insert the preceding weight. 204 previousWeight16 = Collation.COMMON_WEIGHT16; 205 break; 206 } else if(previousStrength == strength && !isTailoredNode(node)) { 207 previousWeight16 = weight16FromNode(node); 208 break; 209 } 210 // Skip weaker nodes and same-level tailored nodes. 211 } 212 if(previousWeight16 == weight16) { 213 // The preceding weight has a node, 214 // maybe with following weaker or tailored nodes. 215 // Reset to the last of them. 216 index = previousIndex; 217 } else { 218 // Insert a node with the preceding weight, reset to that. 219 node = nodeFromWeight16(weight16) | nodeFromStrength(strength); 220 index = insertNodeBetween(previousIndex, index, node); 221 } 222 } else { 223 // Found a stronger node with implied strength-common weight. 224 int weight16 = getWeight16Before(index, node, strength); 225 index = findOrInsertWeakNode(index, weight16, strength); 226 } 227 // Strength of the temporary CE = strength of its reset position. 228 // Code above raises an error if the before-strength is stronger. 229 strength = ceStrength(ces[cesLength - 1]); 230 } 231 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); 232 } 233 234 /** 235 * Returns the secondary or tertiary weight preceding the current node's weight. 236 * node=nodes[index]. 237 */ 238 private int getWeight16Before(int index, long node, int level) { 239 assert(strengthFromNode(node) < level || !isTailoredNode(node)); 240 // Collect the root CE weights if this node is for a root CE. 241 // If it is not, then return the low non-primary boundary for a tailored CE. 242 int t; 243 if(strengthFromNode(node) == Collator.TERTIARY) { 244 t = weight16FromNode(node); 245 } else { 246 t = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight. 247 } 248 while(strengthFromNode(node) > Collator.SECONDARY) { 249 index = previousIndexFromNode(node); 250 node = nodes.elementAti(index); 251 } 252 if(isTailoredNode(node)) { 253 return Collation.BEFORE_WEIGHT16; 254 } 255 int s; 256 if(strengthFromNode(node) == Collator.SECONDARY) { 257 s = weight16FromNode(node); 258 } else { 259 s = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight. 260 } 261 while(strengthFromNode(node) > Collator.PRIMARY) { 262 index = previousIndexFromNode(node); 263 node = nodes.elementAti(index); 264 } 265 if(isTailoredNode(node)) { 266 return Collation.BEFORE_WEIGHT16; 267 } 268 // [p, s, t] is a root CE. Return the preceding weight for the requested level. 269 long p = weight32FromNode(node); 270 int weight16; 271 if(level == Collator.SECONDARY) { 272 weight16 = rootElements.getSecondaryBefore(p, s); 273 } else { 274 weight16 = rootElements.getTertiaryBefore(p, s, t); 275 assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0); 276 } 277 return weight16; 278 } 279 280 private long getSpecialResetPosition(CharSequence str) { 281 assert(str.length() == 2); 282 long ce; 283 int strength = Collator.PRIMARY; 284 boolean isBoundary = false; 285 CollationRuleParser.Position pos = 286 CollationRuleParser.POSITION_VALUES[str.charAt(1) - CollationRuleParser.POS_BASE]; 287 switch(pos) { 288 case FIRST_TERTIARY_IGNORABLE: 289 // Quaternary CEs are not supported. 290 // Non-zero quaternary weights are possible only on tertiary or stronger CEs. 291 return 0; 292 case LAST_TERTIARY_IGNORABLE: 293 return 0; 294 case FIRST_SECONDARY_IGNORABLE: { 295 // Look for a tailored tertiary node after [0, 0, 0]. 296 int index = findOrInsertNodeForRootCE(0, Collator.TERTIARY); 297 long node = nodes.elementAti(index); 298 if((index = nextIndexFromNode(node)) != 0) { 299 node = nodes.elementAti(index); 300 assert(strengthFromNode(node) <= Collator.TERTIARY); 301 if(isTailoredNode(node) && strengthFromNode(node) == Collator.TERTIARY) { 302 return tempCEFromIndexAndStrength(index, Collator.TERTIARY); 303 } 304 } 305 return rootElements.getFirstTertiaryCE(); 306 // No need to look for nodeHasAnyBefore() on a tertiary node. 307 } 308 case LAST_SECONDARY_IGNORABLE: 309 ce = rootElements.getLastTertiaryCE(); 310 strength = Collator.TERTIARY; 311 break; 312 case FIRST_PRIMARY_IGNORABLE: { 313 // Look for a tailored secondary node after [0, 0, *]. 314 int index = findOrInsertNodeForRootCE(0, Collator.SECONDARY); 315 long node = nodes.elementAti(index); 316 while((index = nextIndexFromNode(node)) != 0) { 317 node = nodes.elementAti(index); 318 strength = strengthFromNode(node); 319 if(strength < Collator.SECONDARY) { break; } 320 if(strength == Collator.SECONDARY) { 321 if(isTailoredNode(node)) { 322 if(nodeHasBefore3(node)) { 323 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 324 assert(isTailoredNode(nodes.elementAti(index))); 325 } 326 return tempCEFromIndexAndStrength(index, Collator.SECONDARY); 327 } else { 328 break; 329 } 330 } 331 } 332 ce = rootElements.getFirstSecondaryCE(); 333 strength = Collator.SECONDARY; 334 break; 335 } 336 case LAST_PRIMARY_IGNORABLE: 337 ce = rootElements.getLastSecondaryCE(); 338 strength = Collator.SECONDARY; 339 break; 340 case FIRST_VARIABLE: 341 ce = rootElements.getFirstPrimaryCE(); 342 isBoundary = true; // FractionalUCA.txt: FDD1 00A0, SPACE first primary 343 break; 344 case LAST_VARIABLE: 345 ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); 346 break; 347 case FIRST_REGULAR: 348 ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); 349 isBoundary = true; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary 350 break; 351 case LAST_REGULAR: 352 // Use the Hani-first-primary rather than the actual last "regular" CE before it, 353 // for backward compatibility with behavior before the introduction of 354 // script-first-primary CEs in the root collator. 355 ce = rootElements.firstCEWithPrimaryAtLeast( 356 baseData.getFirstPrimaryForGroup(UScript.HAN)); 357 break; 358 case FIRST_IMPLICIT: 359 ce = baseData.getSingleCE(0x4e00); 360 break; 361 case LAST_IMPLICIT: 362 // We do not support tailoring to an unassigned-implicit CE. 363 throw new UnsupportedOperationException( 364 "reset to [last implicit] not supported"); 365 case FIRST_TRAILING: 366 ce = Collation.makeCE(Collation.FIRST_TRAILING_PRIMARY); 367 isBoundary = true; // trailing first primary (there is no mapping for it) 368 break; 369 case LAST_TRAILING: 370 throw new IllegalArgumentException("LDML forbids tailoring to U+FFFF"); 371 default: 372 assert(false); 373 return 0; 374 } 375 376 int index = findOrInsertNodeForRootCE(ce, strength); 377 long node = nodes.elementAti(index); 378 if((pos.ordinal() & 1) == 0) { 379 // even pos = [first xyz] 380 if(!nodeHasAnyBefore(node) && isBoundary) { 381 // A <group> first primary boundary is artificially added to FractionalUCA.txt. 382 // It is reachable via its special contraction, but is not normally used. 383 // Find the first character tailored after the boundary CE, 384 // or the first real root CE after it. 385 if((index = nextIndexFromNode(node)) != 0) { 386 // If there is a following node, then it must be tailored 387 // because there are no root CEs with a boundary primary 388 // and non-common secondary/tertiary weights. 389 node = nodes.elementAti(index); 390 assert(isTailoredNode(node)); 391 ce = tempCEFromIndexAndStrength(index, strength); 392 } else { 393 assert(strength == Collator.PRIMARY); 394 long p = ce >>> 32; 395 int pIndex = rootElements.findPrimary(p); 396 boolean isCompressible = baseData.isCompressiblePrimary(p); 397 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); 398 ce = Collation.makeCE(p); 399 index = findOrInsertNodeForRootCE(ce, Collator.PRIMARY); 400 node = nodes.elementAti(index); 401 } 402 } 403 if(nodeHasAnyBefore(node)) { 404 // Get the first node that was tailored before this one at a weaker strength. 405 if(nodeHasBefore2(node)) { 406 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 407 node = nodes.elementAti(index); 408 } 409 if(nodeHasBefore3(node)) { 410 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 411 } 412 assert(isTailoredNode(nodes.elementAti(index))); 413 ce = tempCEFromIndexAndStrength(index, strength); 414 } 415 } else { 416 // odd pos = [last xyz] 417 // Find the last node that was tailored after the [last xyz] 418 // at a strength no greater than the position's strength. 419 for(;;) { 420 int nextIndex = nextIndexFromNode(node); 421 if(nextIndex == 0) { break; } 422 long nextNode = nodes.elementAti(nextIndex); 423 if(strengthFromNode(nextNode) < strength) { break; } 424 index = nextIndex; 425 node = nextNode; 426 } 427 // Do not make a temporary CE for a root node. 428 // This last node might be the node for the root CE itself, 429 // or a node with a common secondary or tertiary weight. 430 if(isTailoredNode(node)) { 431 ce = tempCEFromIndexAndStrength(index, strength); 432 } 433 } 434 return ce; 435 } 436 437 /** Implements CollationRuleParser.Sink. */ 438 // Java 6: @Override 439 void addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension) { 440 String nfdPrefix; 441 if(prefix.length() == 0) { 442 nfdPrefix = ""; 443 } else { 444 nfdPrefix = nfd.normalize(prefix); 445 } 446 String nfdString = nfd.normalize(str); 447 448 // The runtime code decomposes Hangul syllables on the fly, 449 // with recursive processing but without making the Jamo pieces visible for matching. 450 // It does not work with certain types of contextual mappings. 451 int nfdLength = nfdString.length(); 452 if(nfdLength >= 2) { 453 char c = nfdString.charAt(0); 454 if(Hangul.isJamoL(c) || Hangul.isJamoV(c)) { 455 // While handling a Hangul syllable, contractions starting with Jamo L or V 456 // would not see the following Jamo of that syllable. 457 throw new UnsupportedOperationException( 458 "contractions starting with conjoining Jamo L or V not supported"); 459 } 460 c = nfdString.charAt(nfdLength - 1); 461 if(Hangul.isJamoL(c) || 462 (Hangul.isJamoV(c) && Hangul.isJamoL(nfdString.charAt(nfdLength - 2)))) { 463 // A contraction ending with Jamo L or L+V would require 464 // generating Hangul syllables in addTailComposites() (588 for a Jamo L), 465 // or decomposing a following Hangul syllable on the fly, during contraction matching. 466 throw new UnsupportedOperationException( 467 "contractions ending with conjoining Jamo L or L+V not supported"); 468 } 469 // A Hangul syllable completely inside a contraction is ok. 470 } 471 // Note: If there is a prefix, then the parser checked that 472 // both the prefix and the string beging with NFC boundaries (not Jamo V or T). 473 // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) 474 // (While handling a Hangul syllable, prefixes on Jamo V or T 475 // would not see the previous Jamo of that syllable.) 476 477 if(strength != Collator.IDENTICAL) { 478 // Find the node index after which we insert the new tailored node. 479 int index = findOrInsertNodeForCEs(strength); 480 assert(cesLength > 0); 481 long ce = ces[cesLength - 1]; 482 if(strength == Collator.PRIMARY && !isTempCE(ce) && (ce >>> 32) == 0) { 483 // There is no primary gap between ignorables and the space-first-primary. 484 throw new UnsupportedOperationException( 485 "tailoring primary after ignorables not supported"); 486 } 487 if(strength == Collator.QUATERNARY && ce == 0) { 488 // The CE data structure does not support non-zero quaternary weights 489 // on tertiary ignorables. 490 throw new UnsupportedOperationException( 491 "tailoring quaternary after tertiary ignorables not supported"); 492 } 493 // Insert the new tailored node. 494 index = insertTailoredNodeAfter(index, strength); 495 // Strength of the temporary CE: 496 // The new relation may yield a stronger CE but not a weaker one. 497 int tempStrength = ceStrength(ce); 498 if(strength < tempStrength) { tempStrength = strength; } 499 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); 500 } 501 502 setCaseBits(nfdString); 503 504 int cesLengthBeforeExtension = cesLength; 505 if(extension.length() != 0) { 506 String nfdExtension = nfd.normalize(extension); 507 cesLength = dataBuilder.getCEs(nfdExtension, ces, cesLength); 508 if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 509 throw new IllegalArgumentException( 510 "extension string adds too many collation elements (more than 31 total)"); 511 } 512 } 513 int ce32 = Collation.UNASSIGNED_CE32; 514 if((!nfdPrefix.contentEquals(prefix) || !nfdString.contentEquals(str)) && 515 !ignorePrefix(prefix) && !ignoreString(str)) { 516 // Map from the original input to the CEs. 517 // We do this in case the canonical closure is incomplete, 518 // so that it is possible to explicitly provide the missing mappings. 519 ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32); 520 } 521 addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32); 522 cesLength = cesLengthBeforeExtension; 523 } 524 525 /** 526 * Picks one of the current CEs and finds or inserts a node in the graph 527 * for the CE + strength. 528 */ 529 private int findOrInsertNodeForCEs(int strength) { 530 assert(Collator.PRIMARY <= strength && strength <= Collator.QUATERNARY); 531 532 // Find the last CE that is at least as "strong" as the requested difference. 533 // Note: Stronger is smaller (Collator.PRIMARY=0). 534 long ce; 535 for(;; --cesLength) { 536 if(cesLength == 0) { 537 ce = ces[0] = 0; 538 cesLength = 1; 539 break; 540 } else { 541 ce = ces[cesLength - 1]; 542 } 543 if(ceStrength(ce) <= strength) { break; } 544 } 545 546 if(isTempCE(ce)) { 547 // No need to findCommonNode() here for lower levels 548 // because insertTailoredNodeAfter() will do that anyway. 549 return indexFromTempCE(ce); 550 } 551 552 // root CE 553 if((int)(ce >>> 56) == Collation.UNASSIGNED_IMPLICIT_BYTE) { 554 throw new UnsupportedOperationException( 555 "tailoring relative to an unassigned code point not supported"); 556 } 557 return findOrInsertNodeForRootCE(ce, strength); 558 } 559 560 private int findOrInsertNodeForRootCE(long ce, int strength) { 561 assert((int)(ce >>> 56) != Collation.UNASSIGNED_IMPLICIT_BYTE); 562 563 // Find or insert the node for each of the root CE's weights, 564 // down to the requested level/strength. 565 // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). 566 assert((ce & 0xc0) == 0); 567 int index = findOrInsertNodeForPrimary(ce >>> 32); 568 if(strength >= Collator.SECONDARY) { 569 int lower32 = (int)ce; 570 index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY); 571 if(strength >= Collator.TERTIARY) { 572 index = findOrInsertWeakNode(index, lower32 & Collation.ONLY_TERTIARY_MASK, 573 Collator.TERTIARY); 574 } 575 } 576 return index; 577 } 578 579 /** 580 * Like Java Collections.binarySearch(List, key, Comparator). 581 * 582 * @return the index>=0 where the item was found, 583 * or the index<0 for inserting the string at ~index in sorted order 584 * (index into rootPrimaryIndexes) 585 */ 586 private static final int binarySearchForRootPrimaryNode( 587 int[] rootPrimaryIndexes, int length, long[] nodes, long p) { 588 if(length == 0) { return ~0; } 589 int start = 0; 590 int limit = length; 591 for (;;) { 592 int i = (start + limit) / 2; 593 long node = nodes[rootPrimaryIndexes[i]]; 594 long nodePrimary = node >>> 32; // weight32FromNode(node) 595 if (p == nodePrimary) { 596 return i; 597 } else if (p < nodePrimary) { 598 if (i == start) { 599 return ~start; // insert s before i 600 } 601 limit = i; 602 } else { 603 if (i == start) { 604 return ~(start + 1); // insert s after i 605 } 606 start = i; 607 } 608 } 609 } 610 611 /** Finds or inserts the node for a root CE's primary weight. */ 612 private int findOrInsertNodeForPrimary(long p) { 613 int rootIndex = binarySearchForRootPrimaryNode( 614 rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); 615 if(rootIndex >= 0) { 616 return rootPrimaryIndexes.elementAti(rootIndex); 617 } else { 618 // Start a new list of nodes with this primary. 619 int index = nodes.size(); 620 nodes.addElement(nodeFromWeight32(p)); 621 rootPrimaryIndexes.insertElementAt(index, ~rootIndex); 622 return index; 623 } 624 } 625 626 /** Finds or inserts the node for a secondary or tertiary weight. */ 627 private int findOrInsertWeakNode(int index, int weight16, int level) { 628 assert(0 <= index && index < nodes.size()); 629 assert(Collator.SECONDARY <= level && level <= Collator.TERTIARY); 630 631 if(weight16 == Collation.COMMON_WEIGHT16) { 632 return findCommonNode(index, level); 633 } 634 635 // If this will be the first below-common weight for the parent node, 636 // then we will also need to insert a common weight after it. 637 long node = nodes.elementAti(index); 638 assert(strengthFromNode(node) < level); // parent node is stronger 639 if(weight16 != 0 && weight16 < Collation.COMMON_WEIGHT16) { 640 int hasThisLevelBefore = level == Collator.SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3; 641 if((node & hasThisLevelBefore) == 0) { 642 // The parent node has an implied level-common weight. 643 long commonNode = 644 nodeFromWeight16(Collation.COMMON_WEIGHT16) | nodeFromStrength(level); 645 if(level == Collator.SECONDARY) { 646 // Move the HAS_BEFORE3 flag from the parent node 647 // to the new secondary common node. 648 commonNode |= node & HAS_BEFORE3; 649 node &= ~(long)HAS_BEFORE3; 650 } 651 nodes.setElementAt(node | hasThisLevelBefore, index); 652 // Insert below-common-weight node. 653 int nextIndex = nextIndexFromNode(node); 654 node = nodeFromWeight16(weight16) | nodeFromStrength(level); 655 index = insertNodeBetween(index, nextIndex, node); 656 // Insert common-weight node. 657 insertNodeBetween(index, nextIndex, commonNode); 658 // Return index of below-common-weight node. 659 return index; 660 } 661 } 662 663 // Find the root CE's weight for this level. 664 // Postpone insertion if not found: 665 // Insert the new root node before the next stronger node, 666 // or before the next root node with the same strength and a larger weight. 667 int nextIndex; 668 while((nextIndex = nextIndexFromNode(node)) != 0) { 669 node = nodes.elementAti(nextIndex); 670 int nextStrength = strengthFromNode(node); 671 if(nextStrength <= level) { 672 // Insert before a stronger node. 673 if(nextStrength < level) { break; } 674 // nextStrength == level 675 if(!isTailoredNode(node)) { 676 int nextWeight16 = weight16FromNode(node); 677 if(nextWeight16 == weight16) { 678 // Found the node for the root CE up to this level. 679 return nextIndex; 680 } 681 // Insert before a node with a larger same-strength weight. 682 if(nextWeight16 > weight16) { break; } 683 } 684 } 685 // Skip the next node. 686 index = nextIndex; 687 } 688 node = nodeFromWeight16(weight16) | nodeFromStrength(level); 689 return insertNodeBetween(index, nextIndex, node); 690 } 691 692 /** 693 * Makes and inserts a new tailored node into the list, after the one at index. 694 * Skips over nodes of weaker strength to maintain collation order 695 * ("postpone insertion"). 696 * @return the new node's index 697 */ 698 private int insertTailoredNodeAfter(int index, int strength) { 699 assert(0 <= index && index < nodes.size()); 700 if(strength >= Collator.SECONDARY) { 701 index = findCommonNode(index, Collator.SECONDARY); 702 if(strength >= Collator.TERTIARY) { 703 index = findCommonNode(index, Collator.TERTIARY); 704 } 705 } 706 // Postpone insertion: 707 // Insert the new node before the next one with a strength at least as strong. 708 long node = nodes.elementAti(index); 709 int nextIndex; 710 while((nextIndex = nextIndexFromNode(node)) != 0) { 711 node = nodes.elementAti(nextIndex); 712 if(strengthFromNode(node) <= strength) { break; } 713 // Skip the next node which has a weaker (larger) strength than the new one. 714 index = nextIndex; 715 } 716 node = IS_TAILORED | nodeFromStrength(strength); 717 return insertNodeBetween(index, nextIndex, node); 718 } 719 720 /** 721 * Inserts a new node into the list, between list-adjacent items. 722 * The node's previous and next indexes must not be set yet. 723 * @return the new node's index 724 */ 725 private int insertNodeBetween(int index, int nextIndex, long node) { 726 assert(previousIndexFromNode(node) == 0); 727 assert(nextIndexFromNode(node) == 0); 728 assert(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); 729 // Append the new node and link it to the existing nodes. 730 int newIndex = nodes.size(); 731 node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); 732 nodes.addElement(node); 733 // nodes[index].nextIndex = newIndex 734 node = nodes.elementAti(index); 735 nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); 736 // nodes[nextIndex].previousIndex = newIndex 737 if(nextIndex != 0) { 738 node = nodes.elementAti(nextIndex); 739 nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); 740 } 741 return newIndex; 742 } 743 744 /** 745 * Finds the node which implies or contains a common=05 weight of the given strength 746 * (secondary or tertiary), if the current node is stronger. 747 * Skips weaker nodes and tailored nodes if the current node is stronger 748 * and is followed by an explicit-common-weight node. 749 * Always returns the input index if that node is no stronger than the given strength. 750 */ 751 private int findCommonNode(int index, int strength) { 752 assert(Collator.SECONDARY <= strength && strength <= Collator.TERTIARY); 753 long node = nodes.elementAti(index); 754 if(strengthFromNode(node) >= strength) { 755 // The current node is no stronger. 756 return index; 757 } 758 if(strength == Collator.SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { 759 // The current node implies the strength-common weight. 760 return index; 761 } 762 index = nextIndexFromNode(node); 763 node = nodes.elementAti(index); 764 assert(!isTailoredNode(node) && strengthFromNode(node) == strength && 765 weight16FromNode(node) < Collation.COMMON_WEIGHT16); 766 // Skip to the explicit common node. 767 do { 768 index = nextIndexFromNode(node); 769 node = nodes.elementAti(index); 770 assert(strengthFromNode(node) >= strength); 771 } while(isTailoredNode(node) || strengthFromNode(node) > strength || 772 weight16FromNode(node) < Collation.COMMON_WEIGHT16); 773 assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16); 774 return index; 775 } 776 777 private void setCaseBits(CharSequence nfdString) { 778 int numTailoredPrimaries = 0; 779 for(int i = 0; i < cesLength; ++i) { 780 if(ceStrength(ces[i]) == Collator.PRIMARY) { ++numTailoredPrimaries; } 781 } 782 // We should not be able to get too many case bits because 783 // cesLength<=31==MAX_EXPANSION_LENGTH. 784 // 31 pairs of case bits fit into an long without setting its sign bit. 785 assert(numTailoredPrimaries <= 31); 786 787 long cases = 0; 788 if(numTailoredPrimaries > 0) { 789 CharSequence s = nfdString; 790 UTF16CollationIterator baseCEs = new UTF16CollationIterator(baseData, false, s, 0); 791 int baseCEsLength = baseCEs.fetchCEs() - 1; 792 assert(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation.NO_CE); 793 794 int lastCase = 0; 795 int numBasePrimaries = 0; 796 for(int i = 0; i < baseCEsLength; ++i) { 797 long ce = baseCEs.getCE(i); 798 if((ce >>> 32) != 0) { 799 ++numBasePrimaries; 800 int c = ((int)ce >> 14) & 3; 801 assert(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE 802 if(numBasePrimaries < numTailoredPrimaries) { 803 cases |= (long)c << ((numBasePrimaries - 1) * 2); 804 } else if(numBasePrimaries == numTailoredPrimaries) { 805 lastCase = c; 806 } else if(c != lastCase) { 807 // There are more base primary CEs than tailored primaries. 808 // Set mixed case if the case bits of the remainder differ. 809 lastCase = 1; 810 // Nothing more can change. 811 break; 812 } 813 } 814 } 815 if(numBasePrimaries >= numTailoredPrimaries) { 816 cases |= (long)lastCase << ((numTailoredPrimaries - 1) * 2); 817 } 818 } 819 820 for(int i = 0; i < cesLength; ++i) { 821 long ce = ces[i] & 0xffffffffffff3fffL; // clear old case bits 822 int strength = ceStrength(ce); 823 if(strength == Collator.PRIMARY) { 824 ce |= (cases & 3) << 14; 825 cases >>>= 2; 826 } else if(strength == Collator.TERTIARY) { 827 // Tertiary CEs must have uppercase bits. 828 // See the LDML spec, and comments in class CollationCompare. 829 ce |= 0x8000; 830 } 831 // Tertiary ignorable CEs must have 0 case bits. 832 // We set 0 case bits for secondary CEs too 833 // since currently only U+0345 is cased and maps to a secondary CE, 834 // and it is lowercase. Other secondaries are uncased. 835 // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. 836 ces[i] = ce; 837 } 838 } 839 840 /** Implements CollationRuleParser.Sink. */ 841 @Override 842 void suppressContractions(UnicodeSet set) { 843 dataBuilder.suppressContractions(set); 844 } 845 846 /** Implements CollationRuleParser.Sink. */ 847 @Override 848 void optimize(UnicodeSet set) { 849 optimizeSet.addAll(set); 850 } 851 852 /** 853 * Adds the mapping and its canonical closure. 854 * Takes ce32=dataBuilder.encodeCEs(...) so that the data builder 855 * need not re-encode the CEs multiple times. 856 */ 857 private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString, 858 long[] newCEs, int newCEsLength, int ce32) { 859 // Map from the NFD input to the CEs. 860 ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32); 861 ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32); 862 addTailComposites(nfdPrefix, nfdString); 863 return ce32; 864 } 865 866 private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString, 867 long[] newCEs, int newCEsLength, int ce32) { 868 // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.) 869 // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String 870 if(nfdPrefix.length() == 0) { 871 CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString()); 872 String prefix = ""; 873 for(;;) { 874 String str = stringIter.next(); 875 if(str == null) { break; } 876 if(ignoreString(str) || str.contentEquals(nfdString)) { continue; } 877 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32); 878 } 879 } else { 880 CanonicalIterator prefixIter = new CanonicalIterator(nfdPrefix.toString()); 881 CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString()); 882 for(;;) { 883 String prefix = prefixIter.next(); 884 if(prefix == null) { break; } 885 if(ignorePrefix(prefix)) { continue; } 886 boolean samePrefix = prefix.contentEquals(nfdPrefix); 887 for(;;) { 888 String str = stringIter.next(); 889 if(str == null) { break; } 890 if(ignoreString(str) || (samePrefix && str.contentEquals(nfdString))) { continue; } 891 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32); 892 } 893 stringIter.reset(); 894 } 895 } 896 return ce32; 897 } 898 899 private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) { 900 // Look for the last starter in the NFD string. 901 int lastStarter; 902 int indexAfterLastStarter = nfdString.length(); 903 for(;;) { 904 if(indexAfterLastStarter == 0) { return; } // no starter at all 905 lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter); 906 if(nfd.getCombiningClass(lastStarter) == 0) { break; } 907 indexAfterLastStarter -= Character.charCount(lastStarter); 908 } 909 // No closure to Hangul syllables since we decompose them on the fly. 910 if(Hangul.isJamoL(lastStarter)) { return; } 911 912 // Are there any composites whose decomposition starts with the lastStarter? 913 // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. 914 // We might find some more equivalent mappings here if it did. 915 UnicodeSet composites = new UnicodeSet(); 916 if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } 917 918 StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder(); 919 long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH]; 920 UnicodeSetIterator iter = new UnicodeSetIterator(composites); 921 while(iter.next()) { 922 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 923 int composite = iter.codepoint; 924 String decomp = nfd.getDecomposition(composite); 925 if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp, 926 newNFDString, newString)) { 927 continue; 928 } 929 int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0); 930 if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) { 931 // Ignore mappings that we cannot store. 932 continue; 933 } 934 // Note: It is possible that the newCEs do not make use of the mapping 935 // for which we are adding the tail composites, in which case we might be adding 936 // unnecessary mappings. 937 // For example, when we add tail composites for ae^ (^=combining circumflex), 938 // UCA discontiguous-contraction matching does not find any matches 939 // for ae_^ (_=any combining diacritic below) *unless* there is also 940 // a contraction mapping for ae. 941 // Thus, if there is no ae contraction, then the ae^ mapping is ignored 942 // while fetching the newCEs for ae_^. 943 // TODO: Try to detect this effectively. 944 // (Alternatively, print a warning when prefix contractions are missing.) 945 946 // We do not need an explicit mapping for the NFD strings. 947 // It is fine if the NFD input collates like this via a sequence of mappings. 948 // It also saves a little bit of space, and may reduce the set of characters with contractions. 949 int ce32 = addIfDifferent(nfdPrefix, newString, 950 newCEs, newCEsLength, Collation.UNASSIGNED_CE32); 951 if(ce32 != Collation.UNASSIGNED_CE32) { 952 // was different, was added 953 addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32); 954 } 955 } 956 } 957 958 private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter, 959 int composite, CharSequence decomp, 960 StringBuilder newNFDString, StringBuilder newString) { 961 assert(Character.codePointBefore(nfdString, indexAfterLastStarter) == 962 Character.codePointAt(decomp, 0)); 963 int lastStarterLength = Character.offsetByCodePoints(decomp, 0, 1); 964 if(lastStarterLength == decomp.length()) { 965 // Singleton decompositions should be found by addWithClosure() 966 // and the CanonicalIterator, so we can ignore them here. 967 return false; 968 } 969 if(equalSubSequences(nfdString, indexAfterLastStarter, decomp, lastStarterLength)) { 970 // same strings, nothing new to be found here 971 return false; 972 } 973 974 // Make new FCD strings that combine a composite, or its decomposition, 975 // into the nfdString's last starter and the combining marks following it. 976 // Make an NFD version, and a version with the composite. 977 newNFDString.setLength(0); 978 newNFDString.append(nfdString, 0, indexAfterLastStarter); 979 newString.setLength(0); 980 newString.append(nfdString, 0, indexAfterLastStarter - lastStarterLength) 981 .appendCodePoint(composite); 982 983 // The following is related to discontiguous contraction matching, 984 // but builds only FCD strings (or else returns false). 985 int sourceIndex = indexAfterLastStarter; 986 int decompIndex = lastStarterLength; 987 // Small optimization: We keep the source character across loop iterations 988 // because we do not always consume it, 989 // and then need not fetch it again nor look up its combining class again. 990 int sourceChar = Collation.SENTINEL_CP; 991 // The cc variables need to be declared before the loop so that at the end 992 // they are set to the last combining classes seen. 993 int sourceCC = 0; 994 int decompCC = 0; 995 for(;;) { 996 if(sourceChar < 0) { 997 if(sourceIndex >= nfdString.length()) { break; } 998 sourceChar = Character.codePointAt(nfdString, sourceIndex); 999 sourceCC = nfd.getCombiningClass(sourceChar); 1000 assert(sourceCC != 0); 1001 } 1002 // We consume a decomposition character in each iteration. 1003 if(decompIndex >= decomp.length()) { break; } 1004 int decompChar = Character.codePointAt(decomp, decompIndex); 1005 decompCC = nfd.getCombiningClass(decompChar); 1006 // Compare the two characters and their combining classes. 1007 if(decompCC == 0) { 1008 // Unable to merge because the source contains a non-zero combining mark 1009 // but the composite's decomposition contains another starter. 1010 // The strings would not be equivalent. 1011 return false; 1012 } else if(sourceCC < decompCC) { 1013 // Composite + sourceChar would not be FCD. 1014 return false; 1015 } else if(decompCC < sourceCC) { 1016 newNFDString.appendCodePoint(decompChar); 1017 decompIndex += Character.charCount(decompChar); 1018 } else if(decompChar != sourceChar) { 1019 // Blocked because same combining class. 1020 return false; 1021 } else { // match: decompChar == sourceChar 1022 newNFDString.appendCodePoint(decompChar); 1023 decompIndex += Character.charCount(decompChar); 1024 sourceIndex += Character.charCount(decompChar); 1025 sourceChar = Collation.SENTINEL_CP; 1026 } 1027 } 1028 // We are at the end of at least one of the two inputs. 1029 if(sourceChar >= 0) { // more characters from nfdString but not from decomp 1030 if(sourceCC < decompCC) { 1031 // Appending the next source character to the composite would not be FCD. 1032 return false; 1033 } 1034 newNFDString.append(nfdString, sourceIndex, nfdString.length()); 1035 newString.append(nfdString, sourceIndex, nfdString.length()); 1036 } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString 1037 newNFDString.append(decomp, decompIndex, decomp.length()); 1038 } 1039 assert(nfd.isNormalized(newNFDString)); 1040 assert(fcd.isNormalized(newString)); 1041 assert(nfd.normalize(newString).equals(newNFDString.toString())); // canonically equivalent 1042 return true; 1043 } 1044 1045 private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) { 1046 // C++ UnicodeString::compare(leftStart, 0x7fffffff, right, rightStart, 0x7fffffff) == 0 1047 int leftLength = left.length(); 1048 if((leftLength - leftStart) != (right.length() - rightStart)) { return false; } 1049 while(leftStart < leftLength) { 1050 if(left.charAt(leftStart++) != right.charAt(rightStart++)) { 1051 return false; 1052 } 1053 } 1054 return true; 1055 } 1056 private boolean ignorePrefix(CharSequence s) { 1057 // Do not map non-FCD prefixes. 1058 return !isFCD(s); 1059 } 1060 private boolean ignoreString(CharSequence s) { 1061 // Do not map non-FCD strings. 1062 // Do not map strings that start with Hangul syllables: We decompose those on the fly. 1063 return !isFCD(s) || Hangul.isHangul(s.charAt(0)); 1064 } 1065 private boolean isFCD(CharSequence s) { 1066 return fcd.isNormalized(s); 1067 } 1068 1069 private static final UnicodeSet COMPOSITES = new UnicodeSet("[:NFD_QC=N:]"); 1070 static { 1071 // Hangul is decomposed on the fly during collation. 1072 COMPOSITES.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END); 1073 } 1074 1075 private void closeOverComposites() { 1076 String prefix = ""; // empty 1077 UnicodeSetIterator iter = new UnicodeSetIterator(COMPOSITES); 1078 while(iter.next()) { 1079 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 1080 String nfdString = nfd.getDecomposition(iter.codepoint); 1081 cesLength = dataBuilder.getCEs(nfdString, ces, 0); 1082 if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 1083 // Too many CEs from the decomposition (unusual), ignore this composite. 1084 // We could add a capacity parameter to getCEs() and reallocate if necessary. 1085 // However, this can only really happen in contrived cases. 1086 continue; 1087 } 1088 String composite = iter.getString(); 1089 addIfDifferent(prefix, composite, ces, cesLength, Collation.UNASSIGNED_CE32); 1090 } 1091 } 1092 1093 private int addIfDifferent(CharSequence prefix, CharSequence str, 1094 long[] newCEs, int newCEsLength, int ce32) { 1095 long[] oldCEs = new long[Collation.MAX_EXPANSION_LENGTH]; 1096 int oldCEsLength = dataBuilder.getCEs(prefix, str, oldCEs, 0); 1097 if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { 1098 if(ce32 == Collation.UNASSIGNED_CE32) { 1099 ce32 = dataBuilder.encodeCEs(newCEs, newCEsLength); 1100 } 1101 dataBuilder.addCE32(prefix, str, ce32); 1102 } 1103 return ce32; 1104 } 1105 1106 private static boolean sameCEs(long[] ces1, int ces1Length, 1107 long[] ces2, int ces2Length) { 1108 if(ces1Length != ces2Length) { 1109 return false; 1110 } 1111 assert(ces1Length <= Collation.MAX_EXPANSION_LENGTH); 1112 for(int i = 0; i < ces1Length; ++i) { 1113 if(ces1[i] != ces2[i]) { return false; } 1114 } 1115 return true; 1116 } 1117 1118 private static final int alignWeightRight(int w) { 1119 if(w != 0) { 1120 while((w & 0xff) == 0) { w >>>= 8; } 1121 } 1122 return w; 1123 } 1124 1125 /** 1126 * Walks the tailoring graph and overwrites tailored nodes with new CEs. 1127 * After this, the graph is destroyed. 1128 * The nodes array can then be used only as a source of tailored CEs. 1129 */ 1130 private void makeTailoredCEs() { 1131 CollationWeights primaries = new CollationWeights(); 1132 CollationWeights secondaries = new CollationWeights(); 1133 CollationWeights tertiaries = new CollationWeights(); 1134 long[] nodesArray = nodes.getBuffer(); 1135 if(DEBUG) { 1136 System.out.println("\nCollationBuilder.makeTailoredCEs()"); 1137 } 1138 1139 for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { 1140 int i = rootPrimaryIndexes.elementAti(rpi); 1141 long node = nodesArray[i]; 1142 long p = weight32FromNode(node); 1143 int s = p == 0 ? 0 : Collation.COMMON_WEIGHT16; 1144 int t = s; 1145 int q = 0; 1146 boolean pIsTailored = false; 1147 boolean sIsTailored = false; 1148 boolean tIsTailored = false; 1149 if(DEBUG) { 1150 System.out.printf("\nprimary %x\n", alignWeightRight((int)p)); 1151 } 1152 int pIndex = p == 0 ? 0 : rootElements.findPrimary(p); 1153 int nextIndex = nextIndexFromNode(node); 1154 while(nextIndex != 0) { 1155 i = nextIndex; 1156 node = nodesArray[i]; 1157 nextIndex = nextIndexFromNode(node); 1158 int strength = strengthFromNode(node); 1159 if(strength == Collator.QUATERNARY) { 1160 assert(isTailoredNode(node)); 1161 if(DEBUG) { 1162 System.out.print(" quat+ "); 1163 } 1164 if(q == 3) { 1165 // C++ U_BUFFER_OVERFLOW_ERROR 1166 throw new UnsupportedOperationException("quaternary tailoring gap too small"); 1167 } 1168 ++q; 1169 } else { 1170 if(strength == Collator.TERTIARY) { 1171 if(isTailoredNode(node)) { 1172 if(DEBUG) { 1173 System.out.print(" ter+ "); 1174 } 1175 if(!tIsTailored) { 1176 // First tailored tertiary node for [p, s]. 1177 int tCount = countTailoredNodes(nodesArray, nextIndex, 1178 Collator.TERTIARY) + 1; 1179 int tLimit; 1180 if(t == 0) { 1181 // Gap at the beginning of the tertiary CE range. 1182 t = rootElements.getTertiaryBoundary() - 0x100; 1183 tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK; 1184 } else if(!pIsTailored && !sIsTailored) { 1185 // p and s are root weights. 1186 tLimit = rootElements.getTertiaryAfter(pIndex, s, t); 1187 } else if(t == Collation.BEFORE_WEIGHT16) { 1188 tLimit = Collation.COMMON_WEIGHT16; 1189 } else { 1190 // [p, s] is tailored. 1191 assert(t == Collation.COMMON_WEIGHT16); 1192 tLimit = rootElements.getTertiaryBoundary(); 1193 } 1194 assert(tLimit == 0x4000 || (tLimit & ~Collation.ONLY_TERTIARY_MASK) == 0); 1195 tertiaries.initForTertiary(); 1196 if(!tertiaries.allocWeights(t, tLimit, tCount)) { 1197 // C++ U_BUFFER_OVERFLOW_ERROR 1198 throw new UnsupportedOperationException("tertiary tailoring gap too small"); 1199 } 1200 tIsTailored = true; 1201 } 1202 t = (int)tertiaries.nextWeight(); 1203 assert(t != 0xffffffff); 1204 } else { 1205 t = weight16FromNode(node); 1206 tIsTailored = false; 1207 if(DEBUG) { 1208 System.out.printf(" ter %x\n", alignWeightRight(t)); 1209 } 1210 } 1211 } else { 1212 if(strength == Collator.SECONDARY) { 1213 if(isTailoredNode(node)) { 1214 if(DEBUG) { 1215 System.out.print(" sec+ "); 1216 } 1217 if(!sIsTailored) { 1218 // First tailored secondary node for p. 1219 int sCount = countTailoredNodes(nodesArray, nextIndex, 1220 Collator.SECONDARY) + 1; 1221 int sLimit; 1222 if(s == 0) { 1223 // Gap at the beginning of the secondary CE range. 1224 s = rootElements.getSecondaryBoundary() - 0x100; 1225 sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16); 1226 } else if(!pIsTailored) { 1227 // p is a root primary. 1228 sLimit = rootElements.getSecondaryAfter(pIndex, s); 1229 } else if(s == Collation.BEFORE_WEIGHT16) { 1230 sLimit = Collation.COMMON_WEIGHT16; 1231 } else { 1232 // p is a tailored primary. 1233 assert(s == Collation.COMMON_WEIGHT16); 1234 sLimit = rootElements.getSecondaryBoundary(); 1235 } 1236 if(s == Collation.COMMON_WEIGHT16) { 1237 // Do not tailor into the getSortKey() range of 1238 // compressed common secondaries. 1239 s = rootElements.getLastCommonSecondary(); 1240 } 1241 secondaries.initForSecondary(); 1242 if(!secondaries.allocWeights(s, sLimit, sCount)) { 1243 // C++ U_BUFFER_OVERFLOW_ERROR 1244 if(DEBUG) { 1245 System.out.printf("!secondaries.allocWeights(%x, %x, sCount=%d)\n", 1246 alignWeightRight(s), alignWeightRight(sLimit), 1247 alignWeightRight(sCount)); 1248 } 1249 throw new UnsupportedOperationException("secondary tailoring gap too small"); 1250 } 1251 sIsTailored = true; 1252 } 1253 s = (int)secondaries.nextWeight(); 1254 assert(s != 0xffffffff); 1255 } else { 1256 s = weight16FromNode(node); 1257 sIsTailored = false; 1258 if(DEBUG) { 1259 System.out.printf(" sec %x\n", alignWeightRight(s)); 1260 } 1261 } 1262 } else /* Collator.PRIMARY */ { 1263 assert(isTailoredNode(node)); 1264 if(DEBUG) { 1265 System.out.print("pri+ "); 1266 } 1267 if(!pIsTailored) { 1268 // First tailored primary node in this list. 1269 int pCount = countTailoredNodes(nodesArray, nextIndex, 1270 Collator.PRIMARY) + 1; 1271 boolean isCompressible = baseData.isCompressiblePrimary(p); 1272 long pLimit = 1273 rootElements.getPrimaryAfter(p, pIndex, isCompressible); 1274 primaries.initForPrimary(isCompressible); 1275 if(!primaries.allocWeights(p, pLimit, pCount)) { 1276 // C++ U_BUFFER_OVERFLOW_ERROR // TODO: introduce a more specific UErrorCode? 1277 throw new UnsupportedOperationException("primary tailoring gap too small"); 1278 } 1279 pIsTailored = true; 1280 } 1281 p = primaries.nextWeight(); 1282 assert(p != 0xffffffffL); 1283 s = Collation.COMMON_WEIGHT16; 1284 sIsTailored = false; 1285 } 1286 t = s == 0 ? 0 : Collation.COMMON_WEIGHT16; 1287 tIsTailored = false; 1288 } 1289 q = 0; 1290 } 1291 if(isTailoredNode(node)) { 1292 nodesArray[i] = Collation.makeCE(p, s, t, q); 1293 if(DEBUG) { 1294 System.out.printf("%016x\n", nodesArray[i]); 1295 } 1296 } 1297 } 1298 } 1299 } 1300 1301 /** 1302 * Counts the tailored nodes of the given strength up to the next node 1303 * which is either stronger or has an explicit weight of this strength. 1304 */ 1305 private static int countTailoredNodes(long[] nodesArray, int i, int strength) { 1306 int count = 0; 1307 for(;;) { 1308 if(i == 0) { break; } 1309 long node = nodesArray[i]; 1310 if(strengthFromNode(node) < strength) { break; } 1311 if(strengthFromNode(node) == strength) { 1312 if(isTailoredNode(node)) { 1313 ++count; 1314 } else { 1315 break; 1316 } 1317 } 1318 i = nextIndexFromNode(node); 1319 } 1320 return count; 1321 } 1322 1323 private static final class CEFinalizer implements CollationDataBuilder.CEModifier { 1324 CEFinalizer(long[] ces) { 1325 finalCEs = ces; 1326 } 1327 public long modifyCE32(int ce32) { 1328 assert(!Collation.isSpecialCE32(ce32)); 1329 if(CollationBuilder.isTempCE32(ce32)) { 1330 // retain case bits 1331 return finalCEs[CollationBuilder.indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8); 1332 } else { 1333 return Collation.NO_CE; 1334 } 1335 } 1336 public long modifyCE(long ce) { 1337 if(CollationBuilder.isTempCE(ce)) { 1338 // retain case bits 1339 return finalCEs[CollationBuilder.indexFromTempCE(ce)] | (ce & 0xc000); 1340 } else { 1341 return Collation.NO_CE; 1342 } 1343 } 1344 1345 private long[] finalCEs; 1346 }; 1347 1348 /** Replaces temporary CEs with the final CEs they point to. */ 1349 private void finalizeCEs() { 1350 CollationDataBuilder newBuilder = new CollationDataBuilder(); 1351 newBuilder.initForTailoring(baseData); 1352 CEFinalizer finalizer = new CEFinalizer(nodes.getBuffer()); 1353 newBuilder.copyFrom(dataBuilder, finalizer); 1354 dataBuilder = newBuilder; 1355 } 1356 1357 /** 1358 * Encodes "temporary CE" data into a CE that fits into the CE32 data structure, 1359 * with 2-byte primary, 1-byte secondary and 6-bit tertiary, 1360 * with valid CE byte values. 1361 * 1362 * The index must not exceed 20 bits (0xfffff). 1363 * The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY). 1364 * 1365 * Temporary CEs are distinguished from real CEs by their use of 1366 * secondary weights 06..45 which are otherwise reserved for compressed sort keys. 1367 * 1368 * The case bits are unused and available. 1369 */ 1370 private static long tempCEFromIndexAndStrength(int index, int strength) { 1371 return 1372 // CE byte offsets, to ensure valid CE bytes, and case bits 11 1373 0x4040000006002000L + 1374 // index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF) 1375 ((long)(index & 0xfe000) << 43) + 1376 // index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF) 1377 ((long)(index & 0x1fc0) << 42) + 1378 // index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45) 1379 ((index & 0x3f) << 24) + 1380 // strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23) 1381 (strength << 8); 1382 } 1383 private static int indexFromTempCE(long tempCE) { 1384 tempCE -= 0x4040000006002000L; 1385 return 1386 ((int)(tempCE >> 43) & 0xfe000) | 1387 ((int)(tempCE >> 42) & 0x1fc0) | 1388 ((int)(tempCE >> 24) & 0x3f); 1389 } 1390 private static int strengthFromTempCE(long tempCE) { 1391 return ((int)tempCE >> 8) & 3; 1392 } 1393 private static boolean isTempCE(long ce) { 1394 int sec = (int)ce >>> 24; 1395 return 6 <= sec && sec <= 0x45; 1396 } 1397 1398 private static int indexFromTempCE32(int tempCE32) { 1399 tempCE32 -= 0x40400620; 1400 return 1401 ((tempCE32 >> 11) & 0xfe000) | 1402 ((tempCE32 >> 10) & 0x1fc0) | 1403 ((tempCE32 >> 8) & 0x3f); 1404 } 1405 private static boolean isTempCE32(int ce32) { 1406 return 1407 (ce32 & 0xff) >= 2 && // not a long-primary/long-secondary CE32 1408 6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45; 1409 } 1410 1411 private static int ceStrength(long ce) { 1412 return 1413 isTempCE(ce) ? strengthFromTempCE(ce) : 1414 (ce & 0xff00000000000000L) != 0 ? Collator.PRIMARY : 1415 ((int)ce & 0xff000000) != 0 ? Collator.SECONDARY : 1416 ce != 0 ? Collator.TERTIARY : 1417 Collator.IDENTICAL; 1418 } 1419 1420 /** At most 1M nodes, limited by the 20 bits in node bit fields. */ 1421 private static final int MAX_INDEX = 0xfffff; 1422 /** 1423 * Node bit 6 is set on a primary node if there are nodes 1424 * with secondary values below the common secondary weight (05). 1425 */ 1426 private static final int HAS_BEFORE2 = 0x40; 1427 /** 1428 * Node bit 5 is set on a primary or secondary node if there are nodes 1429 * with tertiary values below the common tertiary weight (05). 1430 */ 1431 private static final int HAS_BEFORE3 = 0x20; 1432 /** 1433 * Node bit 3 distinguishes a tailored node, which has no weight value, 1434 * from a node with an explicit (root or default) weight. 1435 */ 1436 private static final int IS_TAILORED = 8; 1437 1438 private static long nodeFromWeight32(long weight32) { 1439 return weight32 << 32; 1440 } 1441 private static long nodeFromWeight16(int weight16) { 1442 return (long)weight16 << 48; 1443 } 1444 private static long nodeFromPreviousIndex(int previous) { 1445 return (long)previous << 28; 1446 } 1447 private static long nodeFromNextIndex(int next) { 1448 return next << 8; 1449 } 1450 private static long nodeFromStrength(int strength) { 1451 return strength; 1452 } 1453 1454 private static long weight32FromNode(long node) { 1455 return node >>> 32; 1456 } 1457 private static int weight16FromNode(long node) { 1458 return (int)(node >> 48) & 0xffff; 1459 } 1460 private static int previousIndexFromNode(long node) { 1461 return (int)(node >> 28) & MAX_INDEX; 1462 } 1463 private static int nextIndexFromNode(long node) { 1464 return ((int)node >> 8) & MAX_INDEX; 1465 } 1466 private static int strengthFromNode(long node) { 1467 return (int)node & 3; 1468 } 1469 1470 private static boolean nodeHasBefore2(long node) { 1471 return (node & HAS_BEFORE2) != 0; 1472 } 1473 private static boolean nodeHasBefore3(long node) { 1474 return (node & HAS_BEFORE3) != 0; 1475 } 1476 private static boolean nodeHasAnyBefore(long node) { 1477 return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0; 1478 } 1479 private static boolean isTailoredNode(long node) { 1480 return (node & IS_TAILORED) != 0; 1481 } 1482 1483 private static long changeNodePreviousIndex(long node, int previous) { 1484 return (node & 0xffff00000fffffffL) | nodeFromPreviousIndex(previous); 1485 } 1486 private static long changeNodeNextIndex(long node, int next) { 1487 return (node & 0xfffffffff00000ffL) | nodeFromNextIndex(next); 1488 } 1489 1490 private Normalizer2 nfd, fcd; 1491 private Normalizer2Impl nfcImpl; 1492 1493 private CollationTailoring base; 1494 private CollationData baseData; 1495 private CollationRootElements rootElements; 1496 private long variableTop; 1497 1498 private CollationDataBuilder dataBuilder; 1499 private boolean fastLatinEnabled; 1500 private UnicodeSet optimizeSet = new UnicodeSet(); 1501 1502 private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH]; 1503 private int cesLength; 1504 1505 /** 1506 * Indexes of nodes with root primary weights, sorted by primary. 1507 * Compact form of a TreeMap from root primary to node index. 1508 * 1509 * This is a performance optimization for finding reset positions. 1510 * Without this, we would have to search through the entire nodes list. 1511 * It also allows storing root primary weights in list head nodes, 1512 * without previous index, leaving room in root primary nodes for 32-bit primary weights. 1513 */ 1514 private UVector32 rootPrimaryIndexes; 1515 /** 1516 * Data structure for assigning tailored weights and CEs. 1517 * Doubly-linked lists of nodes in mostly collation order. 1518 * Each list starts with a root primary node and ends with a nextIndex of 0. 1519 * 1520 * When there are any nodes in the list, then there is always a root primary node at index 0. 1521 * This allows some code not to have to check explicitly for nextIndex==0. 1522 * 1523 * Root primary nodes have 32-bit weights but do not have previous indexes. 1524 * All other nodes have at most 16-bit weights and do have previous indexes. 1525 * 1526 * Nodes with explicit weights store root collator weights, 1527 * or default weak weights (e.g., secondary 05) for stronger nodes. 1528 * "Tailored" nodes, with the IS_TAILORED bit set, 1529 * do not store explicit weights but rather 1530 * create a difference of a certain strength from the preceding node. 1531 * 1532 * A root node is followed by either 1533 * - a root/default node of the same strength, or 1534 * - a root/default node of the next-weaker strength, or 1535 * - a tailored node of the same strength. 1536 * 1537 * A node of a given strength normally implies "common" weights on weaker levels. 1538 * 1539 * A node with HAS_BEFORE2 must be immediately followed by 1540 * a secondary node with an explicit below-common weight, then a secondary tailored node, 1541 * and later an explicit common-secondary node. 1542 * The below-common weight can be a root weight, 1543 * or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight 1544 * or before the lowest root weight. 1545 * (&[before 2] resets to an explicit secondary node so that 1546 * the following addRelation(secondary) tailors right after that. 1547 * If we did not have this node and instead were to reset on the primary node, 1548 * then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.) 1549 * 1550 * If the flag is not set, then there are no explicit secondary nodes 1551 * with the common or lower weights. 1552 * 1553 * Same for HAS_BEFORE3 for tertiary nodes and weights. 1554 * A node must not have both flags set. 1555 * 1556 * Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs 1557 * which point to stable indexes in this list, 1558 * and temporary CEs stored in a CollationDataBuilder only point to tailored nodes. 1559 * 1560 * A temporary CE in the ces[] array may point to a non-tailored reset-before-position node, 1561 * until the next relation is added. 1562 * 1563 * At the end, the tailored weights are allocated as necessary, 1564 * then the tailored nodes are replaced with final CEs, 1565 * and the CollationData is rewritten by replacing temporary CEs with final ones. 1566 * 1567 * We cannot simply insert new nodes in the middle of the array 1568 * because that would invalidate the indexes stored in existing temporary CEs. 1569 * We need to use a linked graph with stable indexes to existing nodes. 1570 * A doubly-linked list seems easiest to maintain. 1571 * 1572 * Each node is stored as an long, with its fields stored as bit fields. 1573 * 1574 * Root primary node: 1575 * - primary weight: 32 bits 63..32 1576 * - reserved/unused/zero: 4 bits 31..28 1577 * 1578 * Weaker root nodes & tailored nodes: 1579 * - a weight: 16 bits 63..48 1580 * + a root or default weight for a non-tailored node 1581 * + unused/zero for a tailored node 1582 * - index to the previous node: 20 bits 47..28 1583 * 1584 * All types of nodes: 1585 * - index to the next node: 20 bits 27..8 1586 * + nextIndex=0 in last node per root-primary list 1587 * - reserved/unused/zero bits: bits 7, 4, 2 1588 * - HAS_BEFORE2: bit 6 1589 * - HAS_BEFORE3: bit 5 1590 * - IS_TAILORED: bit 3 1591 * - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0 1592 * 1593 * We could allocate structs with pointers, but we would have to store them 1594 * in a pointer list so that they can be indexed from temporary CEs, 1595 * and they would require more memory allocations. 1596 */ 1597 private UVector64 nodes; 1598} 1599