12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2013-2014, International Business Machines 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Corporation and others. All Rights Reserved. 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* TailoredSet.java, ported from collationsets.h/.cpp 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* C++ version created on: 2013feb09 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* created by: Markus W. Scherer 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/ 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl.coll; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.Iterator; 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl.Hangul; 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Trie2; 202d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubertimport com.ibm.icu.impl.Utility; 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.text.UnicodeSet; 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.CharsTrie; 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.CharsTrie.Entry; 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Finds the set of characters and strings that sort differently in the tailoring 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * from the base data. 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Every mapping in the tailoring needs to be compared to the base, 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * because some mappings are copied for optimization, and 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * all contractions for a character are copied if any contractions for that character 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * are added, modified or removed. 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It might be simpler to re-parse the rule string, but: 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - That would require duplicating some of the from-rules builder code. 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - That would make the runtime code depend on the builder. 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * - That would only work if we have the rule string, and we allow users to 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * omit the rule string from data files. 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic final class TailoredSet { 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationData data; 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private CollationData baseData; 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private UnicodeSet tailored; 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private StringBuilder unreversedPrefix = new StringBuilder(); 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private String suffix; 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public TailoredSet(UnicodeSet t) { 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored = t; 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void forData(CollationData d) { 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert data = d; 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseData = d.base; 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (baseData != null); 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // utrie2_enum(data->trie, NULL, enumTailoredRange, this); 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Iterator<Trie2.Range> trieIterator = data.trie.iterator(); 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Trie2.Range range; 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this); 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) { 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ce32 == Collation.FALLBACK_CE32) { 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; // fallback to base, not tailored 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ts.handleCE32(start, end, ce32); 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J. 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Also, ICU4C requires handleCE32() to be public because it is used by the callback 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // function (enumTailoredRange()). This is not necessary for Java implementation. 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void handleCE32(int start, int end, int ce32) { 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (ce32 != Collation.FALLBACK_CE32); 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isSpecialCE32(ce32)) { 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = data.getIndirectCE32(ce32); 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ce32 == Collation.FALLBACK_CE32) { 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start)); 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Do not just continue if ce32 == baseCE32 because 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // contractions and expansions in different data objects 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // normally differ even if they have the same data offsets. 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) { 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // fastpath 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ce32 != baseCE32) { 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored.add(start); 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compare(start, ce32, baseCE32); 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while (++start <= end); 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void compare(int c, int ce32, int baseCE32) { 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isPrefixCE32(ce32)) { 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int dataIndex = Collation.indexFromCE32(ce32); 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex)); 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isPrefixCE32(baseCE32)) { 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseIndex = Collation.indexFromCE32(baseCE32); 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2); 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addPrefixes(data, c, data.contexts, dataIndex + 2); 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (Collation.isPrefixCE32(baseCE32)) { 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseIndex = Collation.indexFromCE32(baseCE32); 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addPrefixes(baseData, c, baseData.contexts, baseIndex + 2); 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isContractionCE32(ce32)) { 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int dataIndex = Collation.indexFromCE32(ce32); 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = Collation.NO_CE32; 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex)); 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isContractionCE32(baseCE32)) { 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseIndex = Collation.indexFromCE32(baseCE32); 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseCE32 = Collation.NO_CE32; 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2); 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addContractions(c, data.contexts, dataIndex + 2); 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (Collation.isContractionCE32(baseCE32)) { 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseIndex = Collation.indexFromCE32(baseCE32); 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addContractions(c, baseData.contexts, baseIndex + 2); 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int tag; 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isSpecialCE32(ce32)) { 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tag = Collation.tagFromCE32(ce32); 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (tag != Collation.PREFIX_TAG); 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (tag != Collation.CONTRACTION_TAG); 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Currently, the tailoring data builder does not write offset tags. 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // They might be useful for saving space, 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // but they would complicate the builder, 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // and in tailorings we assume that performance of tailored characters is more important. 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (tag != Collation.OFFSET_TAG); 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tag = -1; 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseTag; 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isSpecialCE32(baseCE32)) { 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseTag = Collation.tagFromCE32(baseCE32); 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (baseTag != Collation.PREFIX_TAG); 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (baseTag != Collation.CONTRACTION_TAG); 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert baseTag = -1; 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Non-contextual mappings, expansions, etc. 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (baseTag == Collation.OFFSET_TAG) { 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We might be comparing a tailoring CE which is a copy of 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // a base offset-tag CE, via the [optimize [set]] syntax 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // or when a single-character mapping was copied for tailored contractions. 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Offset tags always result in long-primary CEs, 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // with common secondary/tertiary weights. 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (!Collation.isLongPrimaryCE32(ce32)) { 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)]; 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE); 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.primaryFromLongPrimaryCE32(ce32) != p) { 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (tag != baseTag) { 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (tag == Collation.EXPANSION32_TAG) { 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = Collation.lengthFromCE32(ce32); 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseLength = Collation.lengthFromCE32(baseCE32); 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (length != baseLength) { 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx0 = Collation.indexFromCE32(ce32); 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx1 = Collation.indexFromCE32(baseCE32); 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < length; ++i) { 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) { 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (tag == Collation.EXPANSION_TAG) { 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = Collation.lengthFromCE32(ce32); 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int baseLength = Collation.lengthFromCE32(baseCE32); 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (length != baseLength) { 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return; 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx0 = Collation.indexFromCE32(ce32); 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx1 = Collation.indexFromCE32(baseCE32); 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < length; ++i) { 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) { 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (tag == Collation.HANGUL_TAG) { 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder jamos = new StringBuilder(); 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int length = Hangul.decompose(c, jamos); 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1)) 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert || (length == 3 && tailored.contains(jamos.charAt(2)))) { 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (ce32 != baseCE32) { 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert add(c); 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) { 2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Parallel iteration over prefixes of both tables. 2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator(); 2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator(); 2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String tp = null; // Tailoring prefix. 2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String bp = null; // Base prefix. 2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Use a string with a U+FFFF as the limit sentinel. 2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // U+FFFF is untailorable and will not occur in prefixes. 2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String none = "\uffff"; 2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Entry te = null, be = null; 2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (;;) { 2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (tp == null) { 2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (prefixes.hasNext()) { 2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = prefixes.next(); 2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tp = te.chars.toString(); 2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = null; 2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tp = none; 2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (bp == null) { 2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (basePrefixes.hasNext()) { 2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = basePrefixes.next(); 2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bp = be.chars.toString(); 2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = null; 2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bp = none; 2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2622d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert if (Utility.sameObjects(tp, none) && Utility.sameObjects(bp, none)) { 2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cmp = tp.compareTo(bp); 2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cmp < 0) { 2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // tp occurs in the tailoring but not in the base. 2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (te != null); 2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addPrefix(data, tp, c, te.value); 2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = null; 2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tp = null; 2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (cmp > 0) { 2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // bp occurs in the base but not in the tailoring. 2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (be != null); 2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addPrefix(baseData, bp, c, be.value); 2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = null; 2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bp = null; 2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setPrefix(tp); 2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert assert (te != null && be != null); 2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compare(c, te.value, be.value); 2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert resetPrefix(); 2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = be = null; 2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tp = bp = null; 2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) { 2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Parallel iteration over suffixes of both tables. 2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator(); 2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator(); 2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String ts = null; // Tailoring suffix. 2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String bs = null; // Base suffix. 2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Use a string with two U+FFFF as the limit sentinel. 2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // U+FFFF is untailorable and will not occur in contractions except maybe 2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // as a single suffix character for a root-collator boundary contraction. 2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String none = "\uffff\uffff"; 2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Entry te = null, be = null; 3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (;;) { 3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (ts == null) { 3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (suffixes.hasNext()) { 3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = suffixes.next(); 3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ts = te.chars.toString(); 3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = null; 3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ts = none; 3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (bs == null) { 3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (baseSuffixes.hasNext()) { 3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = baseSuffixes.next(); 3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bs = be.chars.toString(); 3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = null; 3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bs = none; 3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3192d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert if (Utility.sameObjects(ts, none) && Utility.sameObjects(bs, none)) { 3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int cmp = ts.compareTo(bs); 3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (cmp < 0) { 3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ts occurs in the tailoring but not in the base. 3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addSuffix(c, ts); 3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = null; 3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ts = null; 3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else if (cmp > 0) { 3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // bs occurs in the base but not in the tailoring. 3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addSuffix(c, bs); 3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert be = null; 3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert bs = null; 3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert suffix = ts; 3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert compare(c, te.value, be.value); 3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert suffix = null; 3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert te = be = null; 3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ts = bs = null; 3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) { 3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator(); 3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (prefixes.hasNext()) { 3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Entry e = prefixes.next(); 3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addPrefix(d, e.chars, c, e.value); 3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) { 3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setPrefix(pfx); 3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ce32 = d.getFinalCE32(ce32); 3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (Collation.isContractionCE32(ce32)) { 3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int idx = Collation.indexFromCE32(ce32); 3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addContractions(c, d.contexts, idx + 2); 3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c))); 3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert resetPrefix(); 3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addContractions(int c, CharSequence p, int pidx) { 3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator(); 3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while (suffixes.hasNext()) { 3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert Entry e = suffixes.next(); 3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert addSuffix(c, e.chars); 3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void addSuffix(int c, CharSequence sfx) { 3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx)); 3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void add(int c) { 3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (unreversedPrefix.length() == 0 && suffix == null) { 3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored.add(c); 3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert StringBuilder s = new StringBuilder(unreversedPrefix); 3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s.appendCodePoint(c); 3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (suffix != null) { 3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert s.append(suffix); 3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert tailored.add(s); 3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Prefixes are reversed in the data structure. 3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void setPrefix(CharSequence pfx) { 3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert unreversedPrefix.setLength(0); 3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert unreversedPrefix.append(pfx).reverse(); 3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private void resetPrefix() { 3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert unreversedPrefix.setLength(0); 3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 398