17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2013-2014, International Business Machines 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Corporation and others. All Rights Reserved. 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* CollationBuilder.java, ported from collationbuilder.h/.cpp 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* C++ version created on: 2013may06 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* created by: Markus W. Scherer 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl.coll; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.text.ParseException; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Norm2AllModes; 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl; 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl.Hangul; 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UScript; 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.CanonicalIterator; 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.Collator; 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.Normalizer2; 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UnicodeSet; 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UnicodeSetIterator; 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.ULocale; 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class CollationBuilder extends CollationRuleParser.Sink { 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final boolean DEBUG = false; 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class BundleImporter implements CollationRuleParser.Importer { 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert BundleImporter() {} 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public String getRules(String localeID, String collationType) { 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return CollationLoader.loadRules(new ULocale(localeID), collationType); 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public CollationBuilder(CollationTailoring b) { 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfd = Normalizer2.getNFDInstance(); 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fcd = Norm2AllModes.getFCDNormalizer2(); 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl = Norm2AllModes.getNFCInstance().impl; 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert base = b; 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseData = b.data; 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rootElements = new CollationRootElements(b.data.rootElements); 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert variableTop = 0; 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dataBuilder = new CollationDataBuilder(); 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fastLatinEnabled = true; 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesLength = 0; 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rootPrimaryIndexes = new UVector32(); 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nodes = new UVector64(); 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfcImpl.ensureCanonIterData(); 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dataBuilder.initForTailoring(baseData); 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public CollationTailoring parseAndBuild(String ruleString) throws ParseException { 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(baseData.rootElements == null) { 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // C++ U_MISSING_RESOURCE_ERROR 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "missing root elements data, tailoring not supported"); 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationTailoring tailoring = new CollationTailoring(base.settings); 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationRuleParser parser = new CollationRuleParser(baseData); 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: This always bases &[last variable] and &[first regular] 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // on the root collator's maxVariable/variableTop. 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we wanted this to change after [maxVariable x], then we would keep 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the tailoring.settings pointer here and read its variableTop when we need it. 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // See http://unicode.org/cldr/trac/ticket/6070 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert variableTop = base.settings.readOnly().variableTop; 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert parser.setSink(this); 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In Java, there is only one Importer implementation. 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In C++, the importer is a parameter for this method. 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert parser.setImporter(new BundleImporter()); 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationSettings ownedSettings = tailoring.settings.copyOnWrite(); 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert parser.parse(ruleString, ownedSettings); 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(dataBuilder.hasMappings()) { 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert makeTailoredCEs(); 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert closeOverComposites(); 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert finalizeCEs(); 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Copy all of ASCII, and Latin-1 letters, into each tailoring. 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert optimizeSet.add(0, 0x7f); 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert optimizeSet.add(0xc0, 0xff); 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Hangul is decomposed on the fly during collation, 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and the tailoring data is always built with HANGUL_TAG specials. 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert optimizeSet.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END); 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dataBuilder.optimize(optimizeSet); 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailoring.ensureOwnedData(); 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(fastLatinEnabled) { dataBuilder.enableFastLatin(); } 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dataBuilder.build(tailoring.ownedData); 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // C++ tailoring.builder = dataBuilder; 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dataBuilder = null; 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailoring.data = baseData; 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ownedSettings.fastLatinOptions = CollationFastLatin.getOptions( 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailoring.data, ownedSettings, 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ownedSettings.fastLatinPrimaries); 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailoring.rules = ruleString; 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In Java, we do not have a rules version. 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // In C++, the genrb build tool reads and supplies one, 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and the rulesVersion is a parameter for this method. 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailoring.setVersion(base.version, 0 /* rulesVersion */); 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return tailoring; 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Implements CollationRuleParser.Sink. */ 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void addReset(int strength, CharSequence str) { 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(str.length() != 0); 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(str.charAt(0) == CollationRuleParser.POS_LEAD) { 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ces[0] = getSpecialResetPosition(str); 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesLength = 1; 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert((ces[0] & Collation.CASE_AND_QUATERNARY_MASK) == 0); 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // normal reset to a character or string 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String nfdString = nfd.normalize(str); 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesLength = dataBuilder.getCEs(nfdString, ces, 0); 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException( 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "reset position maps to too many collation elements (more than 31)"); 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength == Collator.IDENTICAL) { return; } // simple reset-at-position 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // &[before strength]position 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(Collator.PRIMARY <= strength && strength <= Collator.TERTIARY); 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int index = findOrInsertNodeForCEs(strength); 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long node = nodes.elementAti(index); 127f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // If the index is for a "weaker" node, 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // then skip backwards over this and further "weaker" nodes. 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(strengthFromNode(node) > strength) { 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = previousIndexFromNode(node); 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Find or insert a node whose index we will put into a temporary CE. 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strengthFromNode(node) == strength && isTailoredNode(node)) { 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Reset to just before this same-strength tailored node. 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = previousIndexFromNode(node); 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(strength == Collator.PRIMARY) { 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // root primary node (has no previous index) 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long p = weight32FromNode(node); 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p == 0) { 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "reset primary-before ignorable not possible"); 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p <= rootElements.getFirstPrimary()) { 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // There is no primary gap between ignorables and the space-first-primary. 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "reset primary-before first non-ignorable not supported"); 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(p == Collation.FIRST_TRAILING_PRIMARY) { 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We do not support tailoring to an unassigned-implicit CE. 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "reset primary-before [first trailing] not supported"); 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p = rootElements.getPrimaryBefore(p, baseData.isCompressiblePrimary(p)); 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = findOrInsertNodeForPrimary(p); 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Go to the last node in this list: 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Tailor after the last node between adjacent root nodes. 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextIndex = nextIndexFromNode(node); 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nextIndex == 0) { break; } 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = nextIndex; 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // &[before 2] or &[before 3] 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = findCommonNode(index, Collator.SECONDARY); 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength >= Collator.TERTIARY) { 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = findCommonNode(index, Collator.TERTIARY); 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 171f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // findCommonNode() stayed on the stronger node or moved to 172f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // an explicit common-weight node of the reset-before strength. 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strengthFromNode(node) == strength) { 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Found a same-strength node with an explicit weight. 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int weight16 = weight16FromNode(node); 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(weight16 == 0) { 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (strength == Collator.SECONDARY) ? 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "reset secondary-before secondary ignorable not possible" : 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "reset tertiary-before completely ignorable not possible"); 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 183f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(weight16 > Collation.BEFORE_WEIGHT16); 184f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Reset to just before this node. 185f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Insert the preceding same-level explicit weight if it is not there already. 186f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Which explicit weight immediately precedes this one? 187f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert weight16 = getWeight16Before(index, node, strength); 188f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Does this preceding weight have a node? 189f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int previousWeight16; 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int previousIndex = previousIndexFromNode(node); 191f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert for(int i = previousIndex;; i = previousIndexFromNode(node)) { 192f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert node = nodes.elementAti(i); 193f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int previousStrength = strengthFromNode(node); 194f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(previousStrength < strength) { 195f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(weight16 >= Collation.COMMON_WEIGHT16 || i == previousIndex); 196f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Either the reset element has an above-common weight and 197f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // the parent node provides the implied common weight, 198f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // or the reset element has a weight<=common in the node 199f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // right after the parent, and we need to insert the preceding weight. 200f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert previousWeight16 = Collation.COMMON_WEIGHT16; 201f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert break; 202f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else if(previousStrength == strength && !isTailoredNode(node)) { 203f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert previousWeight16 = weight16FromNode(node); 204f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert break; 205f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 206f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Skip weaker nodes and same-level tailored nodes. 207f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 208f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(previousWeight16 == weight16) { 209f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // The preceding weight has a node, 210f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // maybe with following weaker or tailored nodes. 211f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Reset to the last of them. 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = previousIndex; 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 214f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Insert a node with the preceding weight, reset to that. 215f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert node = nodeFromWeight16(weight16) | nodeFromStrength(strength); 216f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert index = insertNodeBetween(previousIndex, index, node); 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Found a stronger node with implied strength-common weight. 220f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int weight16 = getWeight16Before(index, node, strength); 221f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert index = findOrInsertWeakNode(index, weight16, strength); 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Strength of the temporary CE = strength of its reset position. 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Code above raises an error if the before-strength is stronger. 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert strength = ceStrength(ces[cesLength - 1]); 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 230f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert /** 231f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * Returns the secondary or tertiary weight preceding the current node's weight. 232f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * node=nodes[index]. 233f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert */ 234f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert private int getWeight16Before(int index, long node, int level) { 235f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(strengthFromNode(node) < level || !isTailoredNode(node)); 236f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Collect the root CE weights if this node is for a root CE. 237f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // If it is not, then return the low non-primary boundary for a tailored CE. 238f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int t; 239f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(strengthFromNode(node) == Collator.TERTIARY) { 240f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert t = weight16FromNode(node); 241f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else { 242f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert t = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight. 243f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 244f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert while(strengthFromNode(node) > Collator.SECONDARY) { 245f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert index = previousIndexFromNode(node); 246f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert node = nodes.elementAti(index); 247f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 248f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(isTailoredNode(node)) { 249f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return Collation.BEFORE_WEIGHT16; 250f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 251f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int s; 252f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(strengthFromNode(node) == Collator.SECONDARY) { 253f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert s = weight16FromNode(node); 254f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else { 255f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert s = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight. 256f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 257f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert while(strengthFromNode(node) > Collator.PRIMARY) { 258f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert index = previousIndexFromNode(node); 259f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert node = nodes.elementAti(index); 260f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 261f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(isTailoredNode(node)) { 262f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return Collation.BEFORE_WEIGHT16; 263f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 264f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // [p, s, t] is a root CE. Return the preceding weight for the requested level. 265f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert long p = weight32FromNode(node); 266f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int weight16; 267f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(level == Collator.SECONDARY) { 268f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert weight16 = rootElements.getSecondaryBefore(p, s); 269f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else { 270f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert weight16 = rootElements.getTertiaryBefore(p, s, t); 271f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0); 272f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 273f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return weight16; 274f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 275f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private long getSpecialResetPosition(CharSequence str) { 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(str.length() == 2); 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long ce; 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int strength = Collator.PRIMARY; 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean isBoundary = false; 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationRuleParser.Position pos = 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationRuleParser.POSITION_VALUES[str.charAt(1) - CollationRuleParser.POS_BASE]; 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert switch(pos) { 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case FIRST_TERTIARY_IGNORABLE: 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Quaternary CEs are not supported. 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Non-zero quaternary weights are possible only on tertiary or stronger CEs. 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case LAST_TERTIARY_IGNORABLE: 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case FIRST_SECONDARY_IGNORABLE: { 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Look for a tailored tertiary node after [0, 0, 0]. 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int index = findOrInsertNodeForRootCE(0, Collator.TERTIARY); 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long node = nodes.elementAti(index); 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((index = nextIndexFromNode(node)) != 0) { 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(strengthFromNode(node) <= Collator.TERTIARY); 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isTailoredNode(node) && strengthFromNode(node) == Collator.TERTIARY) { 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return tempCEFromIndexAndStrength(index, Collator.TERTIARY); 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return rootElements.getFirstTertiaryCE(); 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No need to look for nodeHasAnyBefore() on a tertiary node. 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case LAST_SECONDARY_IGNORABLE: 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = rootElements.getLastTertiaryCE(); 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert strength = Collator.TERTIARY; 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case FIRST_PRIMARY_IGNORABLE: { 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Look for a tailored secondary node after [0, 0, *]. 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int index = findOrInsertNodeForRootCE(0, Collator.SECONDARY); 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long node = nodes.elementAti(index); 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while((index = nextIndexFromNode(node)) != 0) { 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert strength = strengthFromNode(node); 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength < Collator.SECONDARY) { break; } 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength == Collator.SECONDARY) { 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isTailoredNode(node)) { 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nodeHasBefore3(node)) { 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(isTailoredNode(nodes.elementAti(index))); 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return tempCEFromIndexAndStrength(index, Collator.SECONDARY); 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = rootElements.getFirstSecondaryCE(); 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert strength = Collator.SECONDARY; 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case LAST_PRIMARY_IGNORABLE: 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = rootElements.getLastSecondaryCE(); 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert strength = Collator.SECONDARY; 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case FIRST_VARIABLE: 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = rootElements.getFirstPrimaryCE(); 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isBoundary = true; // FractionalUCA.txt: FDD1 00A0, SPACE first primary 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case LAST_VARIABLE: 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case FIRST_REGULAR: 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isBoundary = true; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case LAST_REGULAR: 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Use the Hani-first-primary rather than the actual last "regular" CE before it, 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // for backward compatibility with behavior before the introduction of 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // script-first-primary CEs in the root collator. 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = rootElements.firstCEWithPrimaryAtLeast( 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseData.getFirstPrimaryForGroup(UScript.HAN)); 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case FIRST_IMPLICIT: 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = baseData.getSingleCE(0x4e00); 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case LAST_IMPLICIT: 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We do not support tailoring to an unassigned-implicit CE. 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "reset to [last implicit] not supported"); 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case FIRST_TRAILING: 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = Collation.makeCE(Collation.FIRST_TRAILING_PRIMARY); 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isBoundary = true; // trailing first primary (there is no mapping for it) 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert case LAST_TRAILING: 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException("LDML forbids tailoring to U+FFFF"); 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert default: 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(false); 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int index = findOrInsertNodeForRootCE(ce, strength); 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long node = nodes.elementAti(index); 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((pos.ordinal() & 1) == 0) { 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // even pos = [first xyz] 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!nodeHasAnyBefore(node) && isBoundary) { 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // A <group> first primary boundary is artificially added to FractionalUCA.txt. 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // It is reachable via its special contraction, but is not normally used. 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Find the first character tailored after the boundary CE, 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or the first real root CE after it. 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((index = nextIndexFromNode(node)) != 0) { 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If there is a following node, then it must be tailored 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // because there are no root CEs with a boundary primary 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and non-common secondary/tertiary weights. 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(isTailoredNode(node)); 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = tempCEFromIndexAndStrength(index, strength); 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(strength == Collator.PRIMARY); 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long p = ce >>> 32; 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pIndex = rootElements.findPrimary(p); 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean isCompressible = baseData.isCompressiblePrimary(p); 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = Collation.makeCE(p); 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = findOrInsertNodeForRootCE(ce, Collator.PRIMARY); 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nodeHasAnyBefore(node)) { 4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Get the first node that was tailored before this one at a weaker strength. 4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nodeHasBefore2(node)) { 4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nodeHasBefore3(node)) { 4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(isTailoredNode(nodes.elementAti(index))); 4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = tempCEFromIndexAndStrength(index, strength); 4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // odd pos = [last xyz] 4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Find the last node that was tailored after the [last xyz] 4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // at a strength no greater than the position's strength. 4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextIndex = nextIndexFromNode(node); 4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nextIndex == 0) { break; } 4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long nextNode = nodes.elementAti(nextIndex); 4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strengthFromNode(nextNode) < strength) { break; } 4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = nextIndex; 4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nextNode; 4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Do not make a temporary CE for a root node. 4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // This last node might be the node for the root CE itself, 4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or a node with a common secondary or tertiary weight. 4267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isTailoredNode(node)) { 4277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = tempCEFromIndexAndStrength(index, strength); 4287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce; 4317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Implements CollationRuleParser.Sink. */ 4347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Java 6: @Override 4357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension) { 4367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String nfdPrefix; 4377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prefix.length() == 0) { 4387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfdPrefix = ""; 4397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 4407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nfdPrefix = nfd.normalize(prefix); 4417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String nfdString = nfd.normalize(str); 4437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The runtime code decomposes Hangul syllables on the fly, 4457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // with recursive processing but without making the Jamo pieces visible for matching. 4467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // It does not work with certain types of contextual mappings. 4477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nfdLength = nfdString.length(); 4487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nfdLength >= 2) { 4497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert char c = nfdString.charAt(0); 4507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Hangul.isJamoL(c) || Hangul.isJamoV(c)) { 4517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // While handling a Hangul syllable, contractions starting with Jamo L or V 4527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // would not see the following Jamo of that syllable. 4537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 4547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "contractions starting with conjoining Jamo L or V not supported"); 4557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert c = nfdString.charAt(nfdLength - 1); 4577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Hangul.isJamoL(c) || 4587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (Hangul.isJamoV(c) && Hangul.isJamoL(nfdString.charAt(nfdLength - 2)))) { 4597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // A contraction ending with Jamo L or L+V would require 4607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // generating Hangul syllables in addTailComposites() (588 for a Jamo L), 4617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or decomposing a following Hangul syllable on the fly, during contraction matching. 4627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 4637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "contractions ending with conjoining Jamo L or L+V not supported"); 4647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // A Hangul syllable completely inside a contraction is ok. 4667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: If there is a prefix, then the parser checked that 4687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // both the prefix and the string beging with NFC boundaries (not Jamo V or T). 4697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) 4707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (While handling a Hangul syllable, prefixes on Jamo V or T 4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // would not see the previous Jamo of that syllable.) 4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength != Collator.IDENTICAL) { 4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Find the node index after which we insert the new tailored node. 4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int index = findOrInsertNodeForCEs(strength); 4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(cesLength > 0); 4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long ce = ces[cesLength - 1]; 4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength == Collator.PRIMARY && !isTempCE(ce) && (ce >>> 32) == 0) { 4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // There is no primary gap between ignorables and the space-first-primary. 4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 4817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "tailoring primary after ignorables not supported"); 4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength == Collator.QUATERNARY && ce == 0) { 4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The CE data structure does not support non-zero quaternary weights 4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // on tertiary ignorables. 4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "tailoring quaternary after tertiary ignorables not supported"); 4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Insert the new tailored node. 4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = insertTailoredNodeAfter(index, strength); 4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Strength of the temporary CE: 4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The new relation may yield a stronger CE but not a weaker one. 4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int tempStrength = ceStrength(ce); 4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength < tempStrength) { tempStrength = strength; } 4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); 4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setCaseBits(nfdString); 4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cesLengthBeforeExtension = cesLength; 5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(extension.length() != 0) { 5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String nfdExtension = nfd.normalize(extension); 5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesLength = dataBuilder.getCEs(nfdExtension, ces, cesLength); 5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new IllegalArgumentException( 5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "extension string adds too many collation elements (more than 31 total)"); 5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ce32 = Collation.UNASSIGNED_CE32; 5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((!nfdPrefix.contentEquals(prefix) || !nfdString.contentEquals(str)) && 5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert !ignorePrefix(prefix) && !ignoreString(str)) { 5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Map from the original input to the CEs. 5137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We do this in case the canonical closure is incomplete, 5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // so that it is possible to explicitly provide the missing mappings. 5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32); 5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32); 5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesLength = cesLengthBeforeExtension; 5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Picks one of the current CEs and finds or inserts a node in the graph 5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * for the CE + strength. 5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int findOrInsertNodeForCEs(int strength) { 5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(Collator.PRIMARY <= strength && strength <= Collator.QUATERNARY); 5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Find the last CE that is at least as "strong" as the requested difference. 5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: Stronger is smaller (Collator.PRIMARY=0). 5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long ce; 5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;; --cesLength) { 5327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cesLength == 0) { 5337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = ces[0] = 0; 5347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesLength = 1; 5357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 5367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce = ces[cesLength - 1]; 5387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ceStrength(ce) <= strength) { break; } 5407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isTempCE(ce)) { 5437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No need to findCommonNode() here for lower levels 5447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // because insertTailoredNodeAfter() will do that anyway. 5457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return indexFromTempCE(ce); 5467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // root CE 5497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((int)(ce >>> 56) == Collation.UNASSIGNED_IMPLICIT_BYTE) { 5507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException( 5517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert "tailoring relative to an unassigned code point not supported"); 5527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return findOrInsertNodeForRootCE(ce, strength); 5547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int findOrInsertNodeForRootCE(long ce, int strength) { 5577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert((int)(ce >>> 56) != Collation.UNASSIGNED_IMPLICIT_BYTE); 5587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Find or insert the node for each of the root CE's weights, 5607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // down to the requested level/strength. 5617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). 5627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert((ce & 0xc0) == 0); 563f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int index = findOrInsertNodeForPrimary(ce >>> 32); 5647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength >= Collator.SECONDARY) { 5657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lower32 = (int)ce; 5667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY); 5677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength >= Collator.TERTIARY) { 5687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = findOrInsertWeakNode(index, lower32 & Collation.ONLY_TERTIARY_MASK, 5697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Collator.TERTIARY); 5707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return index; 5737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 5757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 5767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Like Java Collections.binarySearch(List, key, Comparator). 5777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 5787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the index>=0 where the item was found, 5797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or the index<0 for inserting the string at ~index in sorted order 5807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * (index into rootPrimaryIndexes) 5817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 5827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int binarySearchForRootPrimaryNode( 5837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int[] rootPrimaryIndexes, int length, long[] nodes, long p) { 5847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(length == 0) { return ~0; } 5857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int start = 0; 5867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int limit = length; 5877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (;;) { 5887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int i = (start + limit) / 2; 5897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long node = nodes[rootPrimaryIndexes[i]]; 5907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long nodePrimary = node >>> 32; // weight32FromNode(node) 5917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (p == nodePrimary) { 5927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return i; 5937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (p < nodePrimary) { 5947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (i == start) { 5957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ~start; // insert s before i 5967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 5977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert limit = i; 5987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 5997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (i == start) { 6007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ~(start + 1); // insert s after i 6017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert start = i; 6037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Finds or inserts the node for a root CE's primary weight. */ 6087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int findOrInsertNodeForPrimary(long p) { 6097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int rootIndex = binarySearchForRootPrimaryNode( 6107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); 6117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(rootIndex >= 0) { 6127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return rootPrimaryIndexes.elementAti(rootIndex); 6137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 6147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Start a new list of nodes with this primary. 6157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int index = nodes.size(); 6167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nodes.addElement(nodeFromWeight32(p)); 6177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rootPrimaryIndexes.insertElementAt(index, ~rootIndex); 6187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return index; 6197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Finds or inserts the node for a secondary or tertiary weight. */ 6237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int findOrInsertWeakNode(int index, int weight16, int level) { 6247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(0 <= index && index < nodes.size()); 625f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(Collator.SECONDARY <= level && level <= Collator.TERTIARY); 6267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(weight16 == Collation.COMMON_WEIGHT16) { 6287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return findCommonNode(index, level); 6297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 630f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 631f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // If this will be the first below-common weight for the parent node, 632f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // then we will also need to insert a common weight after it. 633f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert long node = nodes.elementAti(index); 634f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert assert(strengthFromNode(node) < level); // parent node is stronger 635f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(weight16 != 0 && weight16 < Collation.COMMON_WEIGHT16) { 636f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int hasThisLevelBefore = level == Collator.SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3; 637f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if((node & hasThisLevelBefore) == 0) { 638f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // The parent node has an implied level-common weight. 639f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert long commonNode = 640f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert nodeFromWeight16(Collation.COMMON_WEIGHT16) | nodeFromStrength(level); 641f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(level == Collator.SECONDARY) { 642f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Move the HAS_BEFORE3 flag from the parent node 643f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // to the new secondary common node. 644f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert commonNode |= node & HAS_BEFORE3; 645f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert node &= ~(long)HAS_BEFORE3; 646f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 647f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert nodes.setElementAt(node | hasThisLevelBefore, index); 648f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Insert below-common-weight node. 649f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert int nextIndex = nextIndexFromNode(node); 650f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert node = nodeFromWeight16(weight16) | nodeFromStrength(level); 651f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert index = insertNodeBetween(index, nextIndex, node); 652f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Insert common-weight node. 653f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert insertNodeBetween(index, nextIndex, commonNode); 654f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert // Return index of below-common-weight node. 655f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert return index; 656f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 657f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 658f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert 6597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Find the root CE's weight for this level. 6607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Postpone insertion if not found: 6617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Insert the new root node before the next stronger node, 6627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or before the next root node with the same strength and a larger weight. 6637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextIndex; 6647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while((nextIndex = nextIndexFromNode(node)) != 0) { 6657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(nextIndex); 6667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextStrength = strengthFromNode(node); 6677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nextStrength <= level) { 6687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Insert before a stronger node. 6697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nextStrength < level) { break; } 6707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // nextStrength == level 6717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!isTailoredNode(node)) { 6727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextWeight16 = weight16FromNode(node); 6737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nextWeight16 == weight16) { 6747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Found the node for the root CE up to this level. 6757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return nextIndex; 6767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Insert before a node with a larger same-strength weight. 6787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nextWeight16 > weight16) { break; } 6797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Skip the next node. 6827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = nextIndex; 6837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodeFromWeight16(weight16) | nodeFromStrength(level); 6857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return insertNodeBetween(index, nextIndex, node); 6867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 6877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 6897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Makes and inserts a new tailored node into the list, after the one at index. 6907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Skips over nodes of weaker strength to maintain collation order 6917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * ("postpone insertion"). 6927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the new node's index 6937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 6947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int insertTailoredNodeAfter(int index, int strength) { 6957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(0 <= index && index < nodes.size()); 6967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength >= Collator.SECONDARY) { 6977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = findCommonNode(index, Collator.SECONDARY); 6987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength >= Collator.TERTIARY) { 6997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = findCommonNode(index, Collator.TERTIARY); 7007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Postpone insertion: 7037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Insert the new node before the next one with a strength at least as strong. 7047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long node = nodes.elementAti(index); 7057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextIndex; 7067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while((nextIndex = nextIndexFromNode(node)) != 0) { 7077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(nextIndex); 7087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strengthFromNode(node) <= strength) { break; } 7097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Skip the next node which has a weaker (larger) strength than the new one. 7107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = nextIndex; 7117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = IS_TAILORED | nodeFromStrength(strength); 7137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return insertNodeBetween(index, nextIndex, node); 7147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Inserts a new node into the list, between list-adjacent items. 7187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The node's previous and next indexes must not be set yet. 7197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return the new node's index 7207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int insertNodeBetween(int index, int nextIndex, long node) { 7227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(previousIndexFromNode(node) == 0); 7237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(nextIndexFromNode(node) == 0); 7247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); 7257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Append the new node and link it to the existing nodes. 7267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int newIndex = nodes.size(); 7277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); 7287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nodes.addElement(node); 7297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // nodes[index].nextIndex = newIndex 7307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 7317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); 7327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // nodes[nextIndex].previousIndex = newIndex 7337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nextIndex != 0) { 7347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(nextIndex); 7357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); 7367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return newIndex; 7387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 7417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Finds the node which implies or contains a common=05 weight of the given strength 742f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * (secondary or tertiary), if the current node is stronger. 7437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Skips weaker nodes and tailored nodes if the current node is stronger 7447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and is followed by an explicit-common-weight node. 7457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Always returns the input index if that node is no stronger than the given strength. 7467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 7477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int findCommonNode(int index, int strength) { 7487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(Collator.SECONDARY <= strength && strength <= Collator.TERTIARY); 7497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long node = nodes.elementAti(index); 7507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strengthFromNode(node) >= strength) { 7517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The current node is no stronger. 7527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return index; 7537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength == Collator.SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { 7557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The current node implies the strength-common weight. 7567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return index; 7577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = nextIndexFromNode(node); 7597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 7607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(!isTailoredNode(node) && strengthFromNode(node) == strength && 761f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert weight16FromNode(node) < Collation.COMMON_WEIGHT16); 7627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Skip to the explicit common node. 7637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 7647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert index = nextIndexFromNode(node); 7657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodes.elementAti(index); 7667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(strengthFromNode(node) >= strength); 767f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } while(isTailoredNode(node) || strengthFromNode(node) > strength || 768f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert weight16FromNode(node) < Collation.COMMON_WEIGHT16); 7697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16); 7707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return index; 7717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void setCaseBits(CharSequence nfdString) { 7747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int numTailoredPrimaries = 0; 7757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i = 0; i < cesLength; ++i) { 7767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ceStrength(ces[i]) == Collator.PRIMARY) { ++numTailoredPrimaries; } 7777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 7787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We should not be able to get too many case bits because 7797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // cesLength<=31==MAX_EXPANSION_LENGTH. 7807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // 31 pairs of case bits fit into an long without setting its sign bit. 7817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(numTailoredPrimaries <= 31); 7827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long cases = 0; 7847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(numTailoredPrimaries > 0) { 7857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharSequence s = nfdString; 7867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16CollationIterator baseCEs = new UTF16CollationIterator(baseData, false, s, 0); 7877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseCEsLength = baseCEs.fetchCEs() - 1; 7887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation.NO_CE); 7897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 7907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lastCase = 0; 7917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int numBasePrimaries = 0; 7927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i = 0; i < baseCEsLength; ++i) { 7937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long ce = baseCEs.getCE(i); 7947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((ce >>> 32) != 0) { 7957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++numBasePrimaries; 7967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int c = ((int)ce >> 14) & 3; 7977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE 7987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(numBasePrimaries < numTailoredPrimaries) { 7997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cases |= (long)c << ((numBasePrimaries - 1) * 2); 8007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(numBasePrimaries == numTailoredPrimaries) { 8017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCase = c; 8027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(c != lastCase) { 8037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // There are more base primary CEs than tailored primaries. 8047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Set mixed case if the case bits of the remainder differ. 8057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastCase = 1; 8067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Nothing more can change. 8077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 8087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(numBasePrimaries >= numTailoredPrimaries) { 8127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cases |= (long)lastCase << ((numTailoredPrimaries - 1) * 2); 8137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i = 0; i < cesLength; ++i) { 8177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long ce = ces[i] & 0xffffffffffff3fffL; // clear old case bits 8187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int strength = ceStrength(ce); 8197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength == Collator.PRIMARY) { 8207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce |= (cases & 3) << 14; 8217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cases >>>= 2; 8227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(strength == Collator.TERTIARY) { 8237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Tertiary CEs must have uppercase bits. 8247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // See the LDML spec, and comments in class CollationCompare. 8257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce |= 0x8000; 8267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Tertiary ignorable CEs must have 0 case bits. 8287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We set 0 case bits for secondary CEs too 8297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // since currently only U+0345 is cased and maps to a secondary CE, 8307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and it is lowercase. Other secondaries are uncased. 8317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. 8327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ces[i] = ce; 8337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Implements CollationRuleParser.Sink. */ 8377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 8387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void suppressContractions(UnicodeSet set) { 8397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dataBuilder.suppressContractions(set); 8407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Implements CollationRuleParser.Sink. */ 8437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert @Override 8447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert void optimize(UnicodeSet set) { 8457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert optimizeSet.addAll(set); 8467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 8497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Adds the mapping and its canonical closure. 8507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Takes ce32=dataBuilder.encodeCEs(...) so that the data builder 8517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * need not re-encode the CEs multiple times. 8527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 8537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString, 8547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] newCEs, int newCEsLength, int ce32) { 8557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Map from the NFD input to the CEs. 8567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32); 8577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32); 8587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addTailComposites(nfdPrefix, nfdString); 8597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 8607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString, 8637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] newCEs, int newCEsLength, int ce32) { 8647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.) 8657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String 8667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nfdPrefix.length() == 0) { 8677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString()); 8687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String prefix = ""; 8697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 8707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String str = stringIter.next(); 8717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(str == null) { break; } 8727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ignoreString(str) || str.contentEquals(nfdString)) { continue; } 8737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32); 8747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 8767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CanonicalIterator prefixIter = new CanonicalIterator(nfdPrefix.toString()); 8777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString()); 8787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 8797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String prefix = prefixIter.next(); 8807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(prefix == null) { break; } 8817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ignorePrefix(prefix)) { continue; } 8827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean samePrefix = prefix.contentEquals(nfdPrefix); 8837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 8847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String str = stringIter.next(); 8857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(str == null) { break; } 8867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ignoreString(str) || (samePrefix && str.contentEquals(nfdString))) { continue; } 8877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32); 8887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert stringIter.reset(); 8907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 8937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 8947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 8957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) { 8967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Look for the last starter in the NFD string. 8977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lastStarter; 8987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int indexAfterLastStarter = nfdString.length(); 8997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 9007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(indexAfterLastStarter == 0) { return; } // no starter at all 9017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter); 9027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(nfd.getCombiningClass(lastStarter) == 0) { break; } 9037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert indexAfterLastStarter -= Character.charCount(lastStarter); 9047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // No closure to Hangul syllables since we decompose them on the fly. 9067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(Hangul.isJamoL(lastStarter)) { return; } 9077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Are there any composites whose decomposition starts with the lastStarter? 9097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. 9107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We might find some more equivalent mappings here if it did. 9117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UnicodeSet composites = new UnicodeSet(); 9127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } 9137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder(); 9157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH]; 9167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UnicodeSetIterator iter = new UnicodeSetIterator(composites); 9177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(iter.next()) { 9187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 9197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int composite = iter.codepoint; 9207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String decomp = nfd.getDecomposition(composite); 9217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp, 9227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newNFDString, newString)) { 9237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 9247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0); 9267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) { 9277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Ignore mappings that we cannot store. 9287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 9297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Note: It is possible that the newCEs do not make use of the mapping 9317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // for which we are adding the tail composites, in which case we might be adding 9327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // unnecessary mappings. 9337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // For example, when we add tail composites for ae^ (^=combining circumflex), 9347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // UCA discontiguous-contraction matching does not find any matches 9357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // for ae_^ (_=any combining diacritic below) *unless* there is also 9367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // a contraction mapping for ae. 9377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Thus, if there is no ae contraction, then the ae^ mapping is ignored 9387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // while fetching the newCEs for ae_^. 9397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // TODO: Try to detect this effectively. 9407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // (Alternatively, print a warning when prefix contractions are missing.) 9417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We do not need an explicit mapping for the NFD strings. 9437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // It is fine if the NFD input collates like this via a sequence of mappings. 9447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // It also saves a little bit of space, and may reduce the set of characters with contractions. 9457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int ce32 = addIfDifferent(nfdPrefix, newString, 9467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newCEs, newCEsLength, Collation.UNASSIGNED_CE32); 9477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ce32 != Collation.UNASSIGNED_CE32) { 9487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // was different, was added 9497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32); 9507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter, 9557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int composite, CharSequence decomp, 9567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder newNFDString, StringBuilder newString) { 9577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(Character.codePointBefore(nfdString, indexAfterLastStarter) == 9587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Character.codePointAt(decomp, 0)); 9597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int lastStarterLength = Character.offsetByCodePoints(decomp, 0, 1); 9607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(lastStarterLength == decomp.length()) { 9617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Singleton decompositions should be found by addWithClosure() 9627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and the CanonicalIterator, so we can ignore them here. 9637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 9647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(equalSubSequences(nfdString, indexAfterLastStarter, decomp, lastStarterLength)) { 9667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // same strings, nothing new to be found here 9677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 9687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Make new FCD strings that combine a composite, or its decomposition, 9717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // into the nfdString's last starter and the combining marks following it. 9727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Make an NFD version, and a version with the composite. 9737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newNFDString.setLength(0); 9747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newNFDString.append(nfdString, 0, indexAfterLastStarter); 9757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newString.setLength(0); 9767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newString.append(nfdString, 0, indexAfterLastStarter - lastStarterLength) 9777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert .appendCodePoint(composite); 9787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 9797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The following is related to discontiguous contraction matching, 9807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // but builds only FCD strings (or else returns false). 9817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int sourceIndex = indexAfterLastStarter; 9827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int decompIndex = lastStarterLength; 9837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Small optimization: We keep the source character across loop iterations 9847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // because we do not always consume it, 9857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and then need not fetch it again nor look up its combining class again. 9867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int sourceChar = Collation.SENTINEL_CP; 9877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The cc variables need to be declared before the loop so that at the end 9887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // they are set to the last combining classes seen. 9897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int sourceCC = 0; 9907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int decompCC = 0; 9917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 9927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(sourceChar < 0) { 9937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(sourceIndex >= nfdString.length()) { break; } 9947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sourceChar = Character.codePointAt(nfdString, sourceIndex); 9957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sourceCC = nfd.getCombiningClass(sourceChar); 9967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(sourceCC != 0); 9977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 9987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We consume a decomposition character in each iteration. 9997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(decompIndex >= decomp.length()) { break; } 10007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int decompChar = Character.codePointAt(decomp, decompIndex); 10017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompCC = nfd.getCombiningClass(decompChar); 10027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Compare the two characters and their combining classes. 10037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(decompCC == 0) { 10047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Unable to merge because the source contains a non-zero combining mark 10057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // but the composite's decomposition contains another starter. 10067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The strings would not be equivalent. 10077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 10087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(sourceCC < decompCC) { 10097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Composite + sourceChar would not be FCD. 10107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 10117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(decompCC < sourceCC) { 10127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newNFDString.appendCodePoint(decompChar); 10137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompIndex += Character.charCount(decompChar); 10147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(decompChar != sourceChar) { 10157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Blocked because same combining class. 10167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 10177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { // match: decompChar == sourceChar 10187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newNFDString.appendCodePoint(decompChar); 10197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert decompIndex += Character.charCount(decompChar); 10207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sourceIndex += Character.charCount(decompChar); 10217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sourceChar = Collation.SENTINEL_CP; 10227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We are at the end of at least one of the two inputs. 10257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(sourceChar >= 0) { // more characters from nfdString but not from decomp 10267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(sourceCC < decompCC) { 10277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Appending the next source character to the composite would not be FCD. 10287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 10297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newNFDString.append(nfdString, sourceIndex, nfdString.length()); 10317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newString.append(nfdString, sourceIndex, nfdString.length()); 10327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString 10337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newNFDString.append(decomp, decompIndex, decomp.length()); 10347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(nfd.isNormalized(newNFDString)); 10367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(fcd.isNormalized(newString)); 10377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(nfd.normalize(newString).equals(newNFDString.toString())); // canonically equivalent 10387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 10397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) { 10427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // C++ UnicodeString::compare(leftStart, 0x7fffffff, right, rightStart, 0x7fffffff) == 0 10437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int leftLength = left.length(); 10447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if((leftLength - leftStart) != (right.length() - rightStart)) { return false; } 10457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(leftStart < leftLength) { 10467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(left.charAt(leftStart++) != right.charAt(rightStart++)) { 10477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 10487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 10517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean ignorePrefix(CharSequence s) { 10537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Do not map non-FCD prefixes. 10547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return !isFCD(s); 10557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean ignoreString(CharSequence s) { 10577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Do not map non-FCD strings. 10587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Do not map strings that start with Hangul syllables: We decompose those on the fly. 10597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return !isFCD(s) || Hangul.isHangul(s.charAt(0)); 10607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean isFCD(CharSequence s) { 10627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return fcd.isNormalized(s); 10637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final UnicodeSet COMPOSITES = new UnicodeSet("[:NFD_QC=N:]"); 10667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert static { 10677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Hangul is decomposed on the fly during collation. 10687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert COMPOSITES.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END); 10697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void closeOverComposites() { 10727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String prefix = ""; // empty 10737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UnicodeSetIterator iter = new UnicodeSetIterator(COMPOSITES); 10747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(iter.next()) { 10757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 10767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String nfdString = nfd.getDecomposition(iter.codepoint); 10777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert cesLength = dataBuilder.getCEs(nfdString, ces, 0); 10787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 10797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Too many CEs from the decomposition (unusual), ignore this composite. 10807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We could add a capacity parameter to getCEs() and reallocate if necessary. 10817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // However, this can only really happen in contrived cases. 10827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert continue; 10837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String composite = iter.getString(); 10857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addIfDifferent(prefix, composite, ces, cesLength, Collation.UNASSIGNED_CE32); 10867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 10897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int addIfDifferent(CharSequence prefix, CharSequence str, 10907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] newCEs, int newCEsLength, int ce32) { 10917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] oldCEs = new long[Collation.MAX_EXPANSION_LENGTH]; 10927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int oldCEsLength = dataBuilder.getCEs(prefix, str, oldCEs, 0); 10937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { 10947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ce32 == Collation.UNASSIGNED_CE32) { 10957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = dataBuilder.encodeCEs(newCEs, newCEsLength); 10967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dataBuilder.addCE32(prefix, str, ce32); 10987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 10997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ce32; 11007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean sameCEs(long[] ces1, int ces1Length, 11037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] ces2, int ces2Length) { 11047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ces1Length != ces2Length) { 11057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 11067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(ces1Length <= Collation.MAX_EXPANSION_LENGTH); 11087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int i = 0; i < ces1Length; ++i) { 11097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(ces1[i] != ces2[i]) { return false; } 11107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return true; 11127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int alignWeightRight(int w) { 11157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(w != 0) { 11167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while((w & 0xff) == 0) { w >>>= 8; } 11177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return w; 11197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 11227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Walks the tailoring graph and overwrites tailored nodes with new CEs. 11237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * After this, the graph is destroyed. 11247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The nodes array can then be used only as a source of tailored CEs. 11257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 11267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void makeTailoredCEs() { 11277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationWeights primaries = new CollationWeights(); 11287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationWeights secondaries = new CollationWeights(); 11297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationWeights tertiaries = new CollationWeights(); 11307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long[] nodesArray = nodes.getBuffer(); 1131f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(DEBUG) { 1132f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert System.out.println("\nCollationBuilder.makeTailoredCEs()"); 1133f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 11347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 11357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { 11367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int i = rootPrimaryIndexes.elementAti(rpi); 11377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long node = nodesArray[i]; 11387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long p = weight32FromNode(node); 11397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int s = p == 0 ? 0 : Collation.COMMON_WEIGHT16; 11407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int t = s; 11417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int q = 0; 11427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean pIsTailored = false; 11437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean sIsTailored = false; 11447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean tIsTailored = false; 11457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(DEBUG) { 11467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.printf("\nprimary %x\n", alignWeightRight((int)p)); 11477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pIndex = p == 0 ? 0 : rootElements.findPrimary(p); 11497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int nextIndex = nextIndexFromNode(node); 11507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while(nextIndex != 0) { 11517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert i = nextIndex; 11527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert node = nodesArray[i]; 11537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nextIndex = nextIndexFromNode(node); 11547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int strength = strengthFromNode(node); 11557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength == Collator.QUATERNARY) { 11567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(isTailoredNode(node)); 11577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(DEBUG) { 11587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.print(" quat+ "); 11597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(q == 3) { 11617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // C++ U_BUFFER_OVERFLOW_ERROR 11627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException("quaternary tailoring gap too small"); 11637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++q; 11657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 11667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength == Collator.TERTIARY) { 11677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isTailoredNode(node)) { 11687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(DEBUG) { 11697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.print(" ter+ "); 11707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!tIsTailored) { 11727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // First tailored tertiary node for [p, s]. 11737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int tCount = countTailoredNodes(nodesArray, nextIndex, 11747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Collator.TERTIARY) + 1; 11757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int tLimit; 11767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(t == 0) { 11777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Gap at the beginning of the tertiary CE range. 11787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert t = rootElements.getTertiaryBoundary() - 0x100; 11797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK; 11807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(!pIsTailored && !sIsTailored) { 11817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // p and s are root weights. 11827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tLimit = rootElements.getTertiaryAfter(pIndex, s, t); 1183f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else if(t == Collation.BEFORE_WEIGHT16) { 1184f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert tLimit = Collation.COMMON_WEIGHT16; 11857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 11867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // [p, s] is tailored. 11877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(t == Collation.COMMON_WEIGHT16); 11887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tLimit = rootElements.getTertiaryBoundary(); 11897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(tLimit == 0x4000 || (tLimit & ~Collation.ONLY_TERTIARY_MASK) == 0); 11917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tertiaries.initForTertiary(); 11927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!tertiaries.allocWeights(t, tLimit, tCount)) { 11937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // C++ U_BUFFER_OVERFLOW_ERROR 11947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException("tertiary tailoring gap too small"); 11957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tIsTailored = true; 11977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 11987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert t = (int)tertiaries.nextWeight(); 11997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(t != 0xffffffff); 12007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 12017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert t = weight16FromNode(node); 12027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tIsTailored = false; 12037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(DEBUG) { 12047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.printf(" ter %x\n", alignWeightRight(t)); 12057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 12087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strength == Collator.SECONDARY) { 12097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isTailoredNode(node)) { 12107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(DEBUG) { 12117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.print(" sec+ "); 12127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!sIsTailored) { 12147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // First tailored secondary node for p. 12157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int sCount = countTailoredNodes(nodesArray, nextIndex, 12167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Collator.SECONDARY) + 1; 12177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int sLimit; 12187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s == 0) { 12197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Gap at the beginning of the secondary CE range. 12207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s = rootElements.getSecondaryBoundary() - 0x100; 12217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16); 12227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if(!pIsTailored) { 12237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // p is a root primary. 12247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sLimit = rootElements.getSecondaryAfter(pIndex, s); 1225f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } else if(s == Collation.BEFORE_WEIGHT16) { 1226f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert sLimit = Collation.COMMON_WEIGHT16; 12277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 12287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // p is a tailored primary. 12297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(s == Collation.COMMON_WEIGHT16); 12307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sLimit = rootElements.getSecondaryBoundary(); 12317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(s == Collation.COMMON_WEIGHT16) { 12337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Do not tailor into the getSortKey() range of 12347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // compressed common secondaries. 12357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s = rootElements.getLastCommonSecondary(); 12367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert secondaries.initForSecondary(); 12387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!secondaries.allocWeights(s, sLimit, sCount)) { 12397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // C++ U_BUFFER_OVERFLOW_ERROR 1240f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert if(DEBUG) { 1241f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert System.out.printf("!secondaries.allocWeights(%x, %x, sCount=%d)\n", 1242f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert alignWeightRight(s), alignWeightRight(sLimit), 1243f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert alignWeightRight(sCount)); 1244f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert } 12457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException("secondary tailoring gap too small"); 12467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sIsTailored = true; 12487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s = (int)secondaries.nextWeight(); 12507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(s != 0xffffffff); 12517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 12527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s = weight16FromNode(node); 12537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sIsTailored = false; 12547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(DEBUG) { 12557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.printf(" sec %x\n", alignWeightRight(s)); 12567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else /* Collator.PRIMARY */ { 12597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(isTailoredNode(node)); 12607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(DEBUG) { 12617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.print("pri+ "); 12627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!pIsTailored) { 12647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // First tailored primary node in this list. 12657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pCount = countTailoredNodes(nodesArray, nextIndex, 12667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Collator.PRIMARY) + 1; 12677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean isCompressible = baseData.isCompressiblePrimary(p); 12687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long pLimit = 12697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert rootElements.getPrimaryAfter(p, pIndex, isCompressible); 12707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert primaries.initForPrimary(isCompressible); 12717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(!primaries.allocWeights(p, pLimit, pCount)) { 12727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // C++ U_BUFFER_OVERFLOW_ERROR // TODO: introduce a more specific UErrorCode? 12737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert throw new UnsupportedOperationException("primary tailoring gap too small"); 12747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pIsTailored = true; 12767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert p = primaries.nextWeight(); 12787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(p != 0xffffffffL); 12797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s = Collation.COMMON_WEIGHT16; 12807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert sIsTailored = false; 12817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert t = s == 0 ? 0 : Collation.COMMON_WEIGHT16; 12837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tIsTailored = false; 12847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert q = 0; 12867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isTailoredNode(node)) { 12887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert nodesArray[i] = Collation.makeCE(p, s, t, q); 12897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(DEBUG) { 12907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert System.out.printf("%016x\n", nodesArray[i]); 12917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 12967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 12977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 12987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Counts the tailored nodes of the given strength up to the next node 12997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * which is either stronger or has an explicit weight of this strength. 13007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 13017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int countTailoredNodes(long[] nodesArray, int i, int strength) { 13027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int count = 0; 13037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for(;;) { 13047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(i == 0) { break; } 13057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long node = nodesArray[i]; 13067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strengthFromNode(node) < strength) { break; } 13077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(strengthFromNode(node) == strength) { 13087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(isTailoredNode(node)) { 13097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ++count; 13107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 13117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 13127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert i = nextIndexFromNode(node); 13157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return count; 13177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final class CEFinalizer implements CollationDataBuilder.CEModifier { 13207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEFinalizer(long[] ces) { 13217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert finalCEs = ces; 13227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public long modifyCE32(int ce32) { 13247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert(!Collation.isSpecialCE32(ce32)); 13257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationBuilder.isTempCE32(ce32)) { 13267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // retain case bits 13277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return finalCEs[CollationBuilder.indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8); 13287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 13297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.NO_CE; 13307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public long modifyCE(long ce) { 13337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if(CollationBuilder.isTempCE(ce)) { 13347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // retain case bits 13357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return finalCEs[CollationBuilder.indexFromTempCE(ce)] | (ce & 0xc000); 13367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 13377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return Collation.NO_CE; 13387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private long[] finalCEs; 13427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert }; 13437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** Replaces temporary CEs with the final CEs they point to. */ 13457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void finalizeCEs() { 13467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CollationDataBuilder newBuilder = new CollationDataBuilder(); 13477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newBuilder.initForTailoring(baseData); 13487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CEFinalizer finalizer = new CEFinalizer(nodes.getBuffer()); 13497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert newBuilder.copyFrom(dataBuilder, finalizer); 13507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert dataBuilder = newBuilder; 13517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 13547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Encodes "temporary CE" data into a CE that fits into the CE32 data structure, 13557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * with 2-byte primary, 1-byte secondary and 6-bit tertiary, 13567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * with valid CE byte values. 13577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The index must not exceed 20 bits (0xfffff). 13597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY). 13607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Temporary CEs are distinguished from real CEs by their use of 13627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * secondary weights 06..45 which are otherwise reserved for compressed sort keys. 13637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 13647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The case bits are unused and available. 13657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 13667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static long tempCEFromIndexAndStrength(int index, int strength) { 13677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 13687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // CE byte offsets, to ensure valid CE bytes, and case bits 11 13697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 0x4040000006002000L + 13707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF) 13717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((long)(index & 0xfe000) << 43) + 13727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF) 13737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((long)(index & 0x1fc0) << 42) + 13747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45) 13757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((index & 0x3f) << 24) + 13767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23) 13777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (strength << 8); 13787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int indexFromTempCE(long tempCE) { 13807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tempCE -= 0x4040000006002000L; 13817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 13827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((int)(tempCE >> 43) & 0xfe000) | 13837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((int)(tempCE >> 42) & 0x1fc0) | 13847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((int)(tempCE >> 24) & 0x3f); 13857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int strengthFromTempCE(long tempCE) { 13877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ((int)tempCE >> 8) & 3; 13887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean isTempCE(long ce) { 13907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int sec = (int)ce >>> 24; 13917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 6 <= sec && sec <= 0x45; 13927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 13937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 13947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int indexFromTempCE32(int tempCE32) { 13957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tempCE32 -= 0x40400620; 13967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 13977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((tempCE32 >> 11) & 0xfe000) | 13987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((tempCE32 >> 10) & 0x1fc0) | 13997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((tempCE32 >> 8) & 0x3f); 14007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean isTempCE32(int ce32) { 14027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 14037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (ce32 & 0xff) >= 2 && // not a long-primary/long-secondary CE32 14047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45; 14057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int ceStrength(long ce) { 14087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 14097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert isTempCE(ce) ? strengthFromTempCE(ce) : 14107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (ce & 0xff00000000000000L) != 0 ? Collator.PRIMARY : 14117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ((int)ce & 0xff000000) != 0 ? Collator.SECONDARY : 14127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce != 0 ? Collator.TERTIARY : 14137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Collator.IDENTICAL; 14147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** At most 1M nodes, limited by the 20 bits in node bit fields. */ 14177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int MAX_INDEX = 0xfffff; 14187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1419f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * Node bit 6 is set on a primary node if there are nodes 1420f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * with secondary values below the common secondary weight (05). 14217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 14227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int HAS_BEFORE2 = 0x40; 14237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1424f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * Node bit 5 is set on a primary or secondary node if there are nodes 1425f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * with tertiary values below the common tertiary weight (05). 14267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 14277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int HAS_BEFORE3 = 0x20; 14287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 14297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Node bit 3 distinguishes a tailored node, which has no weight value, 14307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * from a node with an explicit (root or default) weight. 14317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 14327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final int IS_TAILORED = 8; 14337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static long nodeFromWeight32(long weight32) { 14357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return weight32 << 32; 14367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static long nodeFromWeight16(int weight16) { 14387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (long)weight16 << 48; 14397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static long nodeFromPreviousIndex(int previous) { 14417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (long)previous << 28; 14427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static long nodeFromNextIndex(int next) { 14447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return next << 8; 14457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static long nodeFromStrength(int strength) { 14477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return strength; 14487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static long weight32FromNode(long node) { 14517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return node >>> 32; 14527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int weight16FromNode(long node) { 14547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (int)(node >> 48) & 0xffff; 14557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int previousIndexFromNode(long node) { 14577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (int)(node >> 28) & MAX_INDEX; 14587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int nextIndexFromNode(long node) { 14607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return ((int)node >> 8) & MAX_INDEX; 14617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static int strengthFromNode(long node) { 14637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (int)node & 3; 14647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean nodeHasBefore2(long node) { 14677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (node & HAS_BEFORE2) != 0; 14687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean nodeHasBefore3(long node) { 14707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (node & HAS_BEFORE3) != 0; 14717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean nodeHasAnyBefore(long node) { 14737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0; 14747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static boolean isTailoredNode(long node) { 14767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (node & IS_TAILORED) != 0; 14777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static long changeNodePreviousIndex(long node, int previous) { 14807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (node & 0xffff00000fffffffL) | nodeFromPreviousIndex(previous); 14817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static long changeNodeNextIndex(long node, int next) { 14837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (node & 0xfffffffff00000ffL) | nodeFromNextIndex(next); 14847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 14857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private Normalizer2 nfd, fcd; 14877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private Normalizer2Impl nfcImpl; 14887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationTailoring base; 14907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationData baseData; 14917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationRootElements rootElements; 14927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private long variableTop; 14937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationDataBuilder dataBuilder; 14957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private boolean fastLatinEnabled; 14967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet optimizeSet = new UnicodeSet(); 14977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 14987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH]; 14997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private int cesLength; 15007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 15017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 15027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Indexes of nodes with root primary weights, sorted by primary. 15037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Compact form of a TreeMap from root primary to node index. 15047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This is a performance optimization for finding reset positions. 15067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Without this, we would have to search through the entire nodes list. 15077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It also allows storing root primary weights in list head nodes, 15087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * without previous index, leaving room in root primary nodes for 32-bit primary weights. 15097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UVector32 rootPrimaryIndexes; 15117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 15127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Data structure for assigning tailored weights and CEs. 15137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Doubly-linked lists of nodes in mostly collation order. 15147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Each list starts with a root primary node and ends with a nextIndex of 0. 15157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * When there are any nodes in the list, then there is always a root primary node at index 0. 15177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * This allows some code not to have to check explicitly for nextIndex==0. 15187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Root primary nodes have 32-bit weights but do not have previous indexes. 15207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * All other nodes have at most 16-bit weights and do have previous indexes. 15217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Nodes with explicit weights store root collator weights, 15237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or default weak weights (e.g., secondary 05) for stronger nodes. 15247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * "Tailored" nodes, with the IS_TAILORED bit set, 15257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * do not store explicit weights but rather 15267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * create a difference of a certain strength from the preceding node. 15277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A root node is followed by either 15297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - a root/default node of the same strength, or 15307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - a root/default node of the next-weaker strength, or 15317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - a tailored node of the same strength. 15327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A node of a given strength normally implies "common" weights on weaker levels. 15347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A node with HAS_BEFORE2 must be immediately followed by 1536f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * a secondary node with an explicit below-common weight, then a secondary tailored node, 15377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and later an explicit common-secondary node. 1538f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * The below-common weight can be a root weight, 1539f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight 1540f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * or before the lowest root weight. 1541f716bda031dccdec5e47bb40e758c5901d209729Fredrik Roubert * (&[before 2] resets to an explicit secondary node so that 15427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the following addRelation(secondary) tailors right after that. 15437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If we did not have this node and instead were to reset on the primary node, 15447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.) 15457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If the flag is not set, then there are no explicit secondary nodes 15477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * with the common or lower weights. 15487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Same for HAS_BEFORE3 for tertiary nodes and weights. 15507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A node must not have both flags set. 15517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs 15537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * which point to stable indexes in this list, 15547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and temporary CEs stored in a CollationDataBuilder only point to tailored nodes. 15557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A temporary CE in the ces[] array may point to a non-tailored reset-before-position node, 15577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * until the next relation is added. 15587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * At the end, the tailored weights are allocated as necessary, 15607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * then the tailored nodes are replaced with final CEs, 15617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and the CollationData is rewritten by replacing temporary CEs with final ones. 15627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * We cannot simply insert new nodes in the middle of the array 15647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * because that would invalidate the indexes stored in existing temporary CEs. 15657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * We need to use a linked graph with stable indexes to existing nodes. 15667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A doubly-linked list seems easiest to maintain. 15677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Each node is stored as an long, with its fields stored as bit fields. 15697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Root primary node: 15717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - primary weight: 32 bits 63..32 15727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - reserved/unused/zero: 4 bits 31..28 15737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Weaker root nodes & tailored nodes: 15757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - a weight: 16 bits 63..48 15767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * + a root or default weight for a non-tailored node 15777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * + unused/zero for a tailored node 15787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - index to the previous node: 20 bits 47..28 15797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * All types of nodes: 15817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - index to the next node: 20 bits 27..8 15827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * + nextIndex=0 in last node per root-primary list 15837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - reserved/unused/zero bits: bits 7, 4, 2 15847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - HAS_BEFORE2: bit 6 15857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - HAS_BEFORE3: bit 5 15867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - IS_TAILORED: bit 3 15877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0 15887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 15897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * We could allocate structs with pointers, but we would have to store them 15907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * in a pointer list so that they can be indexed from temporary CEs, 15917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * and they would require more memory allocations. 15927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 15937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UVector64 nodes; 15947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 1595