1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2013-2014, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* TailoredSet.java, ported from collationsets.h/.cpp
9*
10* C++ version created on: 2013feb09
11* created by: Markus W. Scherer
12*/
13
14package com.ibm.icu.impl.coll;
15
16import java.util.Iterator;
17
18import com.ibm.icu.impl.Normalizer2Impl.Hangul;
19import com.ibm.icu.impl.Trie2;
20import com.ibm.icu.impl.Utility;
21import com.ibm.icu.text.UnicodeSet;
22import com.ibm.icu.util.CharsTrie;
23import com.ibm.icu.util.CharsTrie.Entry;
24
25/**
26 * Finds the set of characters and strings that sort differently in the tailoring
27 * from the base data.
28 *
29 * Every mapping in the tailoring needs to be compared to the base,
30 * because some mappings are copied for optimization, and
31 * all contractions for a character are copied if any contractions for that character
32 * are added, modified or removed.
33 *
34 * It might be simpler to re-parse the rule string, but:
35 * - That would require duplicating some of the from-rules builder code.
36 * - That would make the runtime code depend on the builder.
37 * - That would only work if we have the rule string, and we allow users to
38 *   omit the rule string from data files.
39 */
40public final class TailoredSet {
41
42    private CollationData data;
43    private CollationData baseData;
44    private UnicodeSet tailored;
45    private StringBuilder unreversedPrefix = new StringBuilder();
46    private String suffix;
47
48    public TailoredSet(UnicodeSet t) {
49        tailored = t;
50    }
51
52    public void forData(CollationData d) {
53        data = d;
54        baseData = d.base;
55        assert (baseData != null);
56        // utrie2_enum(data->trie, NULL, enumTailoredRange, this);
57        Iterator<Trie2.Range> trieIterator = data.trie.iterator();
58        Trie2.Range range;
59        while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
60            enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this);
61        }
62    }
63
64    private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) {
65        if (ce32 == Collation.FALLBACK_CE32) {
66            return; // fallback to base, not tailored
67        }
68        ts.handleCE32(start, end, ce32);
69    }
70
71    // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J.
72    //  Also, ICU4C requires handleCE32() to be public because it is used by the callback
73    //  function (enumTailoredRange()). This is not necessary for Java implementation.
74    private void handleCE32(int start, int end, int ce32) {
75        assert (ce32 != Collation.FALLBACK_CE32);
76        if (Collation.isSpecialCE32(ce32)) {
77            ce32 = data.getIndirectCE32(ce32);
78            if (ce32 == Collation.FALLBACK_CE32) {
79                return;
80            }
81        }
82        do {
83            int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start));
84            // Do not just continue if ce32 == baseCE32 because
85            // contractions and expansions in different data objects
86            // normally differ even if they have the same data offsets.
87            if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) {
88                // fastpath
89                if (ce32 != baseCE32) {
90                    tailored.add(start);
91                }
92            } else {
93                compare(start, ce32, baseCE32);
94            }
95        } while (++start <= end);
96    }
97
98    private void compare(int c, int ce32, int baseCE32) {
99        if (Collation.isPrefixCE32(ce32)) {
100            int dataIndex = Collation.indexFromCE32(ce32);
101            ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
102            if (Collation.isPrefixCE32(baseCE32)) {
103                int baseIndex = Collation.indexFromCE32(baseCE32);
104                baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
105                comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
106            } else {
107                addPrefixes(data, c, data.contexts, dataIndex + 2);
108            }
109        } else if (Collation.isPrefixCE32(baseCE32)) {
110            int baseIndex = Collation.indexFromCE32(baseCE32);
111            baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
112            addPrefixes(baseData, c, baseData.contexts, baseIndex + 2);
113        }
114
115        if (Collation.isContractionCE32(ce32)) {
116            int dataIndex = Collation.indexFromCE32(ce32);
117            if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
118                ce32 = Collation.NO_CE32;
119            } else {
120                ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
121            }
122            if (Collation.isContractionCE32(baseCE32)) {
123                int baseIndex = Collation.indexFromCE32(baseCE32);
124                if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
125                    baseCE32 = Collation.NO_CE32;
126                } else {
127                    baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
128                }
129                compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
130            } else {
131                addContractions(c, data.contexts, dataIndex + 2);
132            }
133        } else if (Collation.isContractionCE32(baseCE32)) {
134            int baseIndex = Collation.indexFromCE32(baseCE32);
135            baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
136            addContractions(c, baseData.contexts, baseIndex + 2);
137        }
138
139        int tag;
140        if (Collation.isSpecialCE32(ce32)) {
141            tag = Collation.tagFromCE32(ce32);
142            assert (tag != Collation.PREFIX_TAG);
143            assert (tag != Collation.CONTRACTION_TAG);
144            // Currently, the tailoring data builder does not write offset tags.
145            // They might be useful for saving space,
146            // but they would complicate the builder,
147            // and in tailorings we assume that performance of tailored characters is more important.
148            assert (tag != Collation.OFFSET_TAG);
149        } else {
150            tag = -1;
151        }
152        int baseTag;
153        if (Collation.isSpecialCE32(baseCE32)) {
154            baseTag = Collation.tagFromCE32(baseCE32);
155            assert (baseTag != Collation.PREFIX_TAG);
156            assert (baseTag != Collation.CONTRACTION_TAG);
157        } else {
158            baseTag = -1;
159        }
160
161        // Non-contextual mappings, expansions, etc.
162        if (baseTag == Collation.OFFSET_TAG) {
163            // We might be comparing a tailoring CE which is a copy of
164            // a base offset-tag CE, via the [optimize [set]] syntax
165            // or when a single-character mapping was copied for tailored contractions.
166            // Offset tags always result in long-primary CEs,
167            // with common secondary/tertiary weights.
168            if (!Collation.isLongPrimaryCE32(ce32)) {
169                add(c);
170                return;
171            }
172            long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)];
173            long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
174            if (Collation.primaryFromLongPrimaryCE32(ce32) != p) {
175                add(c);
176                return;
177            }
178        }
179
180        if (tag != baseTag) {
181            add(c);
182            return;
183        }
184
185        if (tag == Collation.EXPANSION32_TAG) {
186            int length = Collation.lengthFromCE32(ce32);
187            int baseLength = Collation.lengthFromCE32(baseCE32);
188
189            if (length != baseLength) {
190                add(c);
191                return;
192            }
193
194            int idx0 = Collation.indexFromCE32(ce32);
195            int idx1 = Collation.indexFromCE32(baseCE32);
196
197            for (int i = 0; i < length; ++i) {
198                if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) {
199                    add(c);
200                    break;
201                }
202            }
203        } else if (tag == Collation.EXPANSION_TAG) {
204            int length = Collation.lengthFromCE32(ce32);
205            int baseLength = Collation.lengthFromCE32(baseCE32);
206
207            if (length != baseLength) {
208                add(c);
209                return;
210            }
211
212            int idx0 = Collation.indexFromCE32(ce32);
213            int idx1 = Collation.indexFromCE32(baseCE32);
214
215            for (int i = 0; i < length; ++i) {
216                if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) {
217                    add(c);
218                    break;
219                }
220            }
221        } else if (tag == Collation.HANGUL_TAG) {
222            StringBuilder jamos = new StringBuilder();
223            int length = Hangul.decompose(c, jamos);
224            if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1))
225                    || (length == 3 && tailored.contains(jamos.charAt(2)))) {
226                add(c);
227            }
228        } else if (ce32 != baseCE32) {
229            add(c);
230        }
231    }
232
233    private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
234        // Parallel iteration over prefixes of both tables.
235        CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
236        CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator();
237        String tp = null; // Tailoring prefix.
238        String bp = null; // Base prefix.
239        // Use a string with a U+FFFF as the limit sentinel.
240        // U+FFFF is untailorable and will not occur in prefixes.
241        String none = "\uffff";
242        Entry te = null, be = null;
243        for (;;) {
244            if (tp == null) {
245                if (prefixes.hasNext()) {
246                    te = prefixes.next();
247                    tp = te.chars.toString();
248                } else {
249                    te = null;
250                    tp = none;
251                }
252            }
253            if (bp == null) {
254                if (basePrefixes.hasNext()) {
255                    be = basePrefixes.next();
256                    bp = be.chars.toString();
257                } else {
258                    be = null;
259                    bp = none;
260                }
261            }
262            if (Utility.sameObjects(tp, none) && Utility.sameObjects(bp, none)) {
263                break;
264            }
265            int cmp = tp.compareTo(bp);
266            if (cmp < 0) {
267                // tp occurs in the tailoring but not in the base.
268                assert (te != null);
269                addPrefix(data, tp, c, te.value);
270                te = null;
271                tp = null;
272            } else if (cmp > 0) {
273                // bp occurs in the base but not in the tailoring.
274                assert (be != null);
275                addPrefix(baseData, bp, c, be.value);
276                be = null;
277                bp = null;
278            } else {
279                setPrefix(tp);
280                assert (te != null && be != null);
281                compare(c, te.value, be.value);
282                resetPrefix();
283                te = be = null;
284                tp = bp = null;
285            }
286        }
287    }
288
289    private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
290        // Parallel iteration over suffixes of both tables.
291        CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
292        CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator();
293        String ts = null; // Tailoring suffix.
294        String bs = null; // Base suffix.
295        // Use a string with two U+FFFF as the limit sentinel.
296        // U+FFFF is untailorable and will not occur in contractions except maybe
297        // as a single suffix character for a root-collator boundary contraction.
298        String none = "\uffff\uffff";
299        Entry te = null, be = null;
300        for (;;) {
301            if (ts == null) {
302                if (suffixes.hasNext()) {
303                    te = suffixes.next();
304                    ts = te.chars.toString();
305                } else {
306                    te = null;
307                    ts = none;
308                }
309            }
310            if (bs == null) {
311                if (baseSuffixes.hasNext()) {
312                    be = baseSuffixes.next();
313                    bs = be.chars.toString();
314                } else {
315                    be = null;
316                    bs = none;
317                }
318            }
319            if (Utility.sameObjects(ts, none) && Utility.sameObjects(bs, none)) {
320                break;
321            }
322            int cmp = ts.compareTo(bs);
323            if (cmp < 0) {
324                // ts occurs in the tailoring but not in the base.
325                addSuffix(c, ts);
326                te = null;
327                ts = null;
328            } else if (cmp > 0) {
329                // bs occurs in the base but not in the tailoring.
330                addSuffix(c, bs);
331                be = null;
332                bs = null;
333            } else {
334                suffix = ts;
335                compare(c, te.value, be.value);
336                suffix = null;
337                te = be = null;
338                ts = bs = null;
339            }
340        }
341    }
342
343    private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) {
344        CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
345        while (prefixes.hasNext()) {
346            Entry e = prefixes.next();
347            addPrefix(d, e.chars, c, e.value);
348        }
349    }
350
351    private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) {
352        setPrefix(pfx);
353        ce32 = d.getFinalCE32(ce32);
354        if (Collation.isContractionCE32(ce32)) {
355            int idx = Collation.indexFromCE32(ce32);
356            addContractions(c, d.contexts, idx + 2);
357        }
358        tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c)));
359        resetPrefix();
360    }
361
362    private void addContractions(int c, CharSequence p, int pidx) {
363        CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
364        while (suffixes.hasNext()) {
365            Entry e = suffixes.next();
366            addSuffix(c, e.chars);
367        }
368    }
369
370    private void addSuffix(int c, CharSequence sfx) {
371        tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx));
372    }
373
374    private void add(int c) {
375        if (unreversedPrefix.length() == 0 && suffix == null) {
376            tailored.add(c);
377        } else {
378            StringBuilder s = new StringBuilder(unreversedPrefix);
379            s.appendCodePoint(c);
380            if (suffix != null) {
381                s.append(suffix);
382            }
383            tailored.add(s);
384        }
385    }
386
387    // Prefixes are reversed in the data structure.
388    private void setPrefix(CharSequence pfx) {
389        unreversedPrefix.setLength(0);
390        unreversedPrefix.append(pfx).reverse();
391    }
392
393    private void resetPrefix() {
394        unreversedPrefix.setLength(0);
395    }
396}
397
398