17935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 27935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2013-2014, International Business Machines 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Corporation and others. All Rights Reserved. 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* TailoredSet.java, ported from collationsets.h/.cpp 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* C++ version created on: 2013feb09 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* created by: Markus W. Scherer 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl.coll; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.Iterator; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl.Hangul; 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Trie2; 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UnicodeSet; 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.CharsTrie; 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.CharsTrie.Entry; 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Finds the set of characters and strings that sort differently in the tailoring 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * from the base data. 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Every mapping in the tailoring needs to be compared to the base, 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * because some mappings are copied for optimization, and 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * all contractions for a character are copied if any contractions for that character 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * are added, modified or removed. 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It might be simpler to re-parse the rule string, but: 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - That would require duplicating some of the from-rules builder code. 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - That would make the runtime code depend on the builder. 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - That would only work if we have the rule string, and we allow users to 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * omit the rule string from data files. 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class TailoredSet { 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationData data; 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationData baseData; 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet tailored; 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private StringBuilder unreversedPrefix = new StringBuilder(); 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private String suffix; 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public TailoredSet(UnicodeSet t) { 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored = t; 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void forData(CollationData d) { 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert data = d; 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseData = d.base; 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (baseData != null); 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // utrie2_enum(data->trie, NULL, enumTailoredRange, this); 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Iterator<Trie2.Range> trieIterator = data.trie.iterator(); 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Trie2.Range range; 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this); 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) { 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ce32 == Collation.FALLBACK_CE32) { 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; // fallback to base, not tailored 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ts.handleCE32(start, end, ce32); 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J. 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Also, ICU4C requires handleCE32() to be public because it is used by the callback 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // function (enumTailoredRange()). This is not necessary for Java implementation. 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void handleCE32(int start, int end, int ce32) { 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (ce32 != Collation.FALLBACK_CE32); 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isSpecialCE32(ce32)) { 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = data.getIndirectCE32(ce32); 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ce32 == Collation.FALLBACK_CE32) { 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start)); 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Do not just continue if ce32 == baseCE32 because 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // contractions and expansions in different data objects 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // normally differ even if they have the same data offsets. 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) { 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // fastpath 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ce32 != baseCE32) { 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored.add(start); 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compare(start, ce32, baseCE32); 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while (++start <= end); 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void compare(int c, int ce32, int baseCE32) { 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isPrefixCE32(ce32)) { 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int dataIndex = Collation.indexFromCE32(ce32); 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex)); 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isPrefixCE32(baseCE32)) { 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseIndex = Collation.indexFromCE32(baseCE32); 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2); 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addPrefixes(data, c, data.contexts, dataIndex + 2); 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (Collation.isPrefixCE32(baseCE32)) { 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseIndex = Collation.indexFromCE32(baseCE32); 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addPrefixes(baseData, c, baseData.contexts, baseIndex + 2); 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isContractionCE32(ce32)) { 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int dataIndex = Collation.indexFromCE32(ce32); 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = Collation.NO_CE32; 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex)); 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isContractionCE32(baseCE32)) { 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseIndex = Collation.indexFromCE32(baseCE32); 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseCE32 = Collation.NO_CE32; 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2); 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addContractions(c, data.contexts, dataIndex + 2); 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (Collation.isContractionCE32(baseCE32)) { 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseIndex = Collation.indexFromCE32(baseCE32); 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addContractions(c, baseData.contexts, baseIndex + 2); 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int tag; 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isSpecialCE32(ce32)) { 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tag = Collation.tagFromCE32(ce32); 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (tag != Collation.PREFIX_TAG); 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (tag != Collation.CONTRACTION_TAG); 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Currently, the tailoring data builder does not write offset tags. 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // They might be useful for saving space, 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // but they would complicate the builder, 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and in tailorings we assume that performance of tailored characters is more important. 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (tag != Collation.OFFSET_TAG); 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tag = -1; 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseTag; 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isSpecialCE32(baseCE32)) { 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseTag = Collation.tagFromCE32(baseCE32); 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (baseTag != Collation.PREFIX_TAG); 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (baseTag != Collation.CONTRACTION_TAG); 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseTag = -1; 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Non-contextual mappings, expansions, etc. 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (baseTag == Collation.OFFSET_TAG) { 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We might be comparing a tailoring CE which is a copy of 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // a base offset-tag CE, via the [optimize [set]] syntax 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or when a single-character mapping was copied for tailored contractions. 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Offset tags always result in long-primary CEs, 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // with common secondary/tertiary weights. 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!Collation.isLongPrimaryCE32(ce32)) { 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)]; 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE); 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.primaryFromLongPrimaryCE32(ce32) != p) { 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (tag != baseTag) { 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (tag == Collation.EXPANSION32_TAG) { 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = Collation.lengthFromCE32(ce32); 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseLength = Collation.lengthFromCE32(baseCE32); 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (length != baseLength) { 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx0 = Collation.indexFromCE32(ce32); 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx1 = Collation.indexFromCE32(baseCE32); 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < length; ++i) { 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) { 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (tag == Collation.EXPANSION_TAG) { 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = Collation.lengthFromCE32(ce32); 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseLength = Collation.lengthFromCE32(baseCE32); 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (length != baseLength) { 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx0 = Collation.indexFromCE32(ce32); 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx1 = Collation.indexFromCE32(baseCE32); 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < length; ++i) { 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) { 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (tag == Collation.HANGUL_TAG) { 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder jamos = new StringBuilder(); 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = Hangul.decompose(c, jamos); 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1)) 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert || (length == 3 && tailored.contains(jamos.charAt(2)))) { 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (ce32 != baseCE32) { 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) { 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Parallel iteration over prefixes of both tables. 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator(); 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator(); 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String tp = null; // Tailoring prefix. 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String bp = null; // Base prefix. 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Use a string with a U+FFFF as the limit sentinel. 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // U+FFFF is untailorable and will not occur in prefixes. 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String none = "\uffff"; 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Entry te = null, be = null; 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (;;) { 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (tp == null) { 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (prefixes.hasNext()) { 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = prefixes.next(); 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tp = te.chars.toString(); 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = null; 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tp = none; 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (bp == null) { 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (basePrefixes.hasNext()) { 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = basePrefixes.next(); 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bp = be.chars.toString(); 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = null; 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bp = none; 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (tp == none && bp == none) { 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cmp = tp.compareTo(bp); 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cmp < 0) { 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // tp occurs in the tailoring but not in the base. 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (te != null); 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addPrefix(data, tp, c, te.value); 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = null; 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tp = null; 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (cmp > 0) { 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // bp occurs in the base but not in the tailoring. 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (be != null); 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addPrefix(baseData, bp, c, be.value); 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = null; 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bp = null; 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setPrefix(tp); 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (te != null && be != null); 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compare(c, te.value, be.value); 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert resetPrefix(); 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = be = null; 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tp = bp = null; 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) { 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Parallel iteration over suffixes of both tables. 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator(); 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator(); 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String ts = null; // Tailoring suffix. 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String bs = null; // Base suffix. 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Use a string with two U+FFFF as the limit sentinel. 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // U+FFFF is untailorable and will not occur in contractions except maybe 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // as a single suffix character for a root-collator boundary contraction. 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String none = "\uffff\uffff"; 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Entry te = null, be = null; 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (;;) { 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ts == null) { 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (suffixes.hasNext()) { 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = suffixes.next(); 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ts = te.chars.toString(); 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = null; 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ts = none; 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (bs == null) { 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (baseSuffixes.hasNext()) { 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = baseSuffixes.next(); 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bs = be.chars.toString(); 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = null; 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bs = none; 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ts == none && bs == none) { 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cmp = ts.compareTo(bs); 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cmp < 0) { 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ts occurs in the tailoring but not in the base. 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addSuffix(c, ts); 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = null; 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ts = null; 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (cmp > 0) { 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // bs occurs in the base but not in the tailoring. 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addSuffix(c, bs); 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = null; 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bs = null; 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert suffix = ts; 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compare(c, te.value, be.value); 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert suffix = null; 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = be = null; 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ts = bs = null; 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) { 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator(); 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (prefixes.hasNext()) { 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Entry e = prefixes.next(); 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addPrefix(d, e.chars, c, e.value); 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) { 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setPrefix(pfx); 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.getFinalCE32(ce32); 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isContractionCE32(ce32)) { 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx = Collation.indexFromCE32(ce32); 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addContractions(c, d.contexts, idx + 2); 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c))); 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert resetPrefix(); 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addContractions(int c, CharSequence p, int pidx) { 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator(); 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (suffixes.hasNext()) { 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Entry e = suffixes.next(); 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addSuffix(c, e.chars); 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addSuffix(int c, CharSequence sfx) { 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx)); 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void add(int c) { 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (unreversedPrefix.length() == 0 && suffix == null) { 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored.add(c); 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder s = new StringBuilder(unreversedPrefix); 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s.appendCodePoint(c); 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (suffix != null) { 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s.append(suffix); 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored.add(s); 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Prefixes are reversed in the data structure. 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void setPrefix(CharSequence pfx) { 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert unreversedPrefix.setLength(0); 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert unreversedPrefix.append(pfx).reverse(); 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void resetPrefix() { 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert unreversedPrefix.setLength(0); 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 395