// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html#License /* ******************************************************************************* * Copyright (C) 2013-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * TailoredSet.java, ported from collationsets.h/.cpp * * C++ version created on: 2013feb09 * created by: Markus W. Scherer */ package com.ibm.icu.impl.coll; import java.util.Iterator; import com.ibm.icu.impl.Normalizer2Impl.Hangul; import com.ibm.icu.impl.Trie2; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.CharsTrie; import com.ibm.icu.util.CharsTrie.Entry; /** * Finds the set of characters and strings that sort differently in the tailoring * from the base data. * * Every mapping in the tailoring needs to be compared to the base, * because some mappings are copied for optimization, and * all contractions for a character are copied if any contractions for that character * are added, modified or removed. * * It might be simpler to re-parse the rule string, but: * - That would require duplicating some of the from-rules builder code. * - That would make the runtime code depend on the builder. * - That would only work if we have the rule string, and we allow users to * omit the rule string from data files. */ public final class TailoredSet { private CollationData data; private CollationData baseData; private UnicodeSet tailored; private StringBuilder unreversedPrefix = new StringBuilder(); private String suffix; public TailoredSet(UnicodeSet t) { tailored = t; } public void forData(CollationData d) { data = d; baseData = d.base; assert (baseData != null); // utrie2_enum(data->trie, NULL, enumTailoredRange, this); Iterator trieIterator = data.trie.iterator(); Trie2.Range range; while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this); } } private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) { if (ce32 == Collation.FALLBACK_CE32) { return; // fallback to base, not tailored } ts.handleCE32(start, end, ce32); } // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J. // Also, ICU4C requires handleCE32() to be public because it is used by the callback // function (enumTailoredRange()). This is not necessary for Java implementation. private void handleCE32(int start, int end, int ce32) { assert (ce32 != Collation.FALLBACK_CE32); if (Collation.isSpecialCE32(ce32)) { ce32 = data.getIndirectCE32(ce32); if (ce32 == Collation.FALLBACK_CE32) { return; } } do { int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start)); // Do not just continue if ce32 == baseCE32 because // contractions and expansions in different data objects // normally differ even if they have the same data offsets. if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) { // fastpath if (ce32 != baseCE32) { tailored.add(start); } } else { compare(start, ce32, baseCE32); } } while (++start <= end); } private void compare(int c, int ce32, int baseCE32) { if (Collation.isPrefixCE32(ce32)) { int dataIndex = Collation.indexFromCE32(ce32); ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex)); if (Collation.isPrefixCE32(baseCE32)) { int baseIndex = Collation.indexFromCE32(baseCE32); baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2); } else { addPrefixes(data, c, data.contexts, dataIndex + 2); } } else if (Collation.isPrefixCE32(baseCE32)) { int baseIndex = Collation.indexFromCE32(baseCE32); baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); addPrefixes(baseData, c, baseData.contexts, baseIndex + 2); } if (Collation.isContractionCE32(ce32)) { int dataIndex = Collation.indexFromCE32(ce32); if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { ce32 = Collation.NO_CE32; } else { ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex)); } if (Collation.isContractionCE32(baseCE32)) { int baseIndex = Collation.indexFromCE32(baseCE32); if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { baseCE32 = Collation.NO_CE32; } else { baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); } compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2); } else { addContractions(c, data.contexts, dataIndex + 2); } } else if (Collation.isContractionCE32(baseCE32)) { int baseIndex = Collation.indexFromCE32(baseCE32); baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); addContractions(c, baseData.contexts, baseIndex + 2); } int tag; if (Collation.isSpecialCE32(ce32)) { tag = Collation.tagFromCE32(ce32); assert (tag != Collation.PREFIX_TAG); assert (tag != Collation.CONTRACTION_TAG); // Currently, the tailoring data builder does not write offset tags. // They might be useful for saving space, // but they would complicate the builder, // and in tailorings we assume that performance of tailored characters is more important. assert (tag != Collation.OFFSET_TAG); } else { tag = -1; } int baseTag; if (Collation.isSpecialCE32(baseCE32)) { baseTag = Collation.tagFromCE32(baseCE32); assert (baseTag != Collation.PREFIX_TAG); assert (baseTag != Collation.CONTRACTION_TAG); } else { baseTag = -1; } // Non-contextual mappings, expansions, etc. if (baseTag == Collation.OFFSET_TAG) { // We might be comparing a tailoring CE which is a copy of // a base offset-tag CE, via the [optimize [set]] syntax // or when a single-character mapping was copied for tailored contractions. // Offset tags always result in long-primary CEs, // with common secondary/tertiary weights. if (!Collation.isLongPrimaryCE32(ce32)) { add(c); return; } long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)]; long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE); if (Collation.primaryFromLongPrimaryCE32(ce32) != p) { add(c); return; } } if (tag != baseTag) { add(c); return; } if (tag == Collation.EXPANSION32_TAG) { int length = Collation.lengthFromCE32(ce32); int baseLength = Collation.lengthFromCE32(baseCE32); if (length != baseLength) { add(c); return; } int idx0 = Collation.indexFromCE32(ce32); int idx1 = Collation.indexFromCE32(baseCE32); for (int i = 0; i < length; ++i) { if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) { add(c); break; } } } else if (tag == Collation.EXPANSION_TAG) { int length = Collation.lengthFromCE32(ce32); int baseLength = Collation.lengthFromCE32(baseCE32); if (length != baseLength) { add(c); return; } int idx0 = Collation.indexFromCE32(ce32); int idx1 = Collation.indexFromCE32(baseCE32); for (int i = 0; i < length; ++i) { if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) { add(c); break; } } } else if (tag == Collation.HANGUL_TAG) { StringBuilder jamos = new StringBuilder(); int length = Hangul.decompose(c, jamos); if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1)) || (length == 3 && tailored.contains(jamos.charAt(2)))) { add(c); } } else if (ce32 != baseCE32) { add(c); } } private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) { // Parallel iteration over prefixes of both tables. CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator(); CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator(); String tp = null; // Tailoring prefix. String bp = null; // Base prefix. // Use a string with a U+FFFF as the limit sentinel. // U+FFFF is untailorable and will not occur in prefixes. String none = "\uffff"; Entry te = null, be = null; for (;;) { if (tp == null) { if (prefixes.hasNext()) { te = prefixes.next(); tp = te.chars.toString(); } else { te = null; tp = none; } } if (bp == null) { if (basePrefixes.hasNext()) { be = basePrefixes.next(); bp = be.chars.toString(); } else { be = null; bp = none; } } if (Utility.sameObjects(tp, none) && Utility.sameObjects(bp, none)) { break; } int cmp = tp.compareTo(bp); if (cmp < 0) { // tp occurs in the tailoring but not in the base. assert (te != null); addPrefix(data, tp, c, te.value); te = null; tp = null; } else if (cmp > 0) { // bp occurs in the base but not in the tailoring. assert (be != null); addPrefix(baseData, bp, c, be.value); be = null; bp = null; } else { setPrefix(tp); assert (te != null && be != null); compare(c, te.value, be.value); resetPrefix(); te = be = null; tp = bp = null; } } } private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) { // Parallel iteration over suffixes of both tables. CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator(); CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator(); String ts = null; // Tailoring suffix. String bs = null; // Base suffix. // Use a string with two U+FFFF as the limit sentinel. // U+FFFF is untailorable and will not occur in contractions except maybe // as a single suffix character for a root-collator boundary contraction. String none = "\uffff\uffff"; Entry te = null, be = null; for (;;) { if (ts == null) { if (suffixes.hasNext()) { te = suffixes.next(); ts = te.chars.toString(); } else { te = null; ts = none; } } if (bs == null) { if (baseSuffixes.hasNext()) { be = baseSuffixes.next(); bs = be.chars.toString(); } else { be = null; bs = none; } } if (Utility.sameObjects(ts, none) && Utility.sameObjects(bs, none)) { break; } int cmp = ts.compareTo(bs); if (cmp < 0) { // ts occurs in the tailoring but not in the base. addSuffix(c, ts); te = null; ts = null; } else if (cmp > 0) { // bs occurs in the base but not in the tailoring. addSuffix(c, bs); be = null; bs = null; } else { suffix = ts; compare(c, te.value, be.value); suffix = null; te = be = null; ts = bs = null; } } } private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) { CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator(); while (prefixes.hasNext()) { Entry e = prefixes.next(); addPrefix(d, e.chars, c, e.value); } } private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) { setPrefix(pfx); ce32 = d.getFinalCE32(ce32); if (Collation.isContractionCE32(ce32)) { int idx = Collation.indexFromCE32(ce32); addContractions(c, d.contexts, idx + 2); } tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c))); resetPrefix(); } private void addContractions(int c, CharSequence p, int pidx) { CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator(); while (suffixes.hasNext()) { Entry e = suffixes.next(); addSuffix(c, e.chars); } } private void addSuffix(int c, CharSequence sfx) { tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx)); } private void add(int c) { if (unreversedPrefix.length() == 0 && suffix == null) { tailored.add(c); } else { StringBuilder s = new StringBuilder(unreversedPrefix); s.appendCodePoint(c); if (suffix != null) { s.append(suffix); } tailored.add(s); } } // Prefixes are reversed in the data structure. private void setPrefix(CharSequence pfx) { unreversedPrefix.setLength(0); unreversedPrefix.append(pfx).reverse(); } private void resetPrefix() { unreversedPrefix.setLength(0); } }