1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2013-2014, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* ContractionsAndExpansions.java, ported from collationsets.h/.cpp
9*
10* C++ version created on: 2013feb09
11* created by: Markus W. Scherer
12*/
13
14package com.ibm.icu.impl.coll;
15
16import java.util.Iterator;
17
18import com.ibm.icu.impl.Trie2;
19import com.ibm.icu.text.UnicodeSet;
20import com.ibm.icu.util.CharsTrie;
21import com.ibm.icu.util.CharsTrie.Entry;
22
23public final class ContractionsAndExpansions {
24    // C++: The following fields are @internal, only public for access by callback.
25    private CollationData data;
26    private UnicodeSet contractions;
27    private UnicodeSet expansions;
28    private CESink sink;
29    private boolean addPrefixes;
30    private int checkTailored = 0;  // -1: collected tailored  +1: exclude tailored
31    private UnicodeSet tailored = new UnicodeSet();
32    private UnicodeSet ranges;
33    private StringBuilder unreversedPrefix = new StringBuilder();
34    private String suffix;
35    private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
36
37    public static interface CESink {
38        void handleCE(long ce);
39        void handleExpansion(long ces[], int start, int length);
40    }
41
42    public ContractionsAndExpansions(UnicodeSet con, UnicodeSet exp, CESink s, boolean prefixes) {
43        contractions = con;
44        expansions = exp;
45        sink = s;
46        addPrefixes = prefixes;
47    }
48
49    public void forData(CollationData d) {
50        // Add all from the data, can be tailoring or base.
51        if (d.base != null) {
52            checkTailored = -1;
53        }
54        data = d;
55        Iterator<Trie2.Range> trieIterator = data.trie.iterator();
56        Trie2.Range range;
57        while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
58            enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
59        }
60        if (d.base == null) {
61            return;
62        }
63        // Add all from the base data but only for un-tailored code points.
64        tailored.freeze();
65        checkTailored = 1;
66        data = d.base;
67        trieIterator = data.trie.iterator();
68        while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
69            enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
70        }
71    }
72
73    private void enumCnERange(int start, int end, int ce32, ContractionsAndExpansions cne) {
74        if (cne.checkTailored == 0) {
75            // There is no tailoring.
76            // No need to collect nor check the tailored set.
77        } else if (cne.checkTailored < 0) {
78            // Collect the set of code points with mappings in the tailoring data.
79            if (ce32 == Collation.FALLBACK_CE32) {
80                return; // fallback to base, not tailored
81            } else {
82                cne.tailored.add(start, end);
83            }
84            // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
85        } else if (start == end) {
86            if (cne.tailored.contains(start)) {
87                return;
88            }
89        } else if (cne.tailored.containsSome(start, end)) {
90            if (cne.ranges == null) {
91                cne.ranges = new UnicodeSet();
92            }
93            cne.ranges.set(start, end).removeAll(cne.tailored);
94            int count = cne.ranges.getRangeCount();
95            for (int i = 0; i < count; ++i) {
96                cne.handleCE32(cne.ranges.getRangeStart(i), cne.ranges.getRangeEnd(i), ce32);
97            }
98        }
99        cne.handleCE32(start, end, ce32);
100    }
101
102    public void forCodePoint(CollationData d, int c) {
103        int ce32 = d.getCE32(c);
104        if (ce32 == Collation.FALLBACK_CE32) {
105            d = d.base;
106            ce32 = d.getCE32(c);
107        }
108        data = d;
109        handleCE32(c, c, ce32);
110    }
111
112    private void handleCE32(int start, int end, int ce32) {
113        for (;;) {
114            if ((ce32 & 0xff) < Collation.SPECIAL_CE32_LOW_BYTE) {
115                // !isSpecialCE32()
116                if (sink != null) {
117                    sink.handleCE(Collation.ceFromSimpleCE32(ce32));
118                }
119                return;
120            }
121            switch (Collation.tagFromCE32(ce32)) {
122            case Collation.FALLBACK_TAG:
123                return;
124            case Collation.RESERVED_TAG_3:
125            case Collation.BUILDER_DATA_TAG:
126            case Collation.LEAD_SURROGATE_TAG:
127                // Java porting note: U_INTERNAL_PROGRAM_ERROR is set to errorCode in ICU4C.
128                throw new AssertionError(
129                        String.format("Unexpected CE32 tag type %d for ce32=0x%08x",
130                                Collation.tagFromCE32(ce32), ce32));
131            case Collation.LONG_PRIMARY_TAG:
132                if (sink != null) {
133                    sink.handleCE(Collation.ceFromLongPrimaryCE32(ce32));
134                }
135                return;
136            case Collation.LONG_SECONDARY_TAG:
137                if (sink != null) {
138                    sink.handleCE(Collation.ceFromLongSecondaryCE32(ce32));
139                }
140                return;
141            case Collation.LATIN_EXPANSION_TAG:
142                if (sink != null) {
143                    ces[0] = Collation.latinCE0FromCE32(ce32);
144                    ces[1] = Collation.latinCE1FromCE32(ce32);
145                    sink.handleExpansion(ces, 0, 2);
146                }
147                // Optimization: If we have a prefix,
148                // then the relevant strings have been added already.
149                if (unreversedPrefix.length() == 0) {
150                    addExpansions(start, end);
151                }
152                return;
153            case Collation.EXPANSION32_TAG:
154                if (sink != null) {
155                    int idx = Collation.indexFromCE32(ce32);
156                    int length = Collation.lengthFromCE32(ce32);
157                    for (int i = 0; i < length; ++i) {
158                        ces[i] = Collation.ceFromCE32(data.ce32s[idx + i]);
159                    }
160                    sink.handleExpansion(ces, 0, length);
161                }
162                // Optimization: If we have a prefix,
163                // then the relevant strings have been added already.
164                if (unreversedPrefix.length() == 0) {
165                    addExpansions(start, end);
166                }
167                return;
168            case Collation.EXPANSION_TAG:
169                if (sink != null) {
170                    int idx = Collation.indexFromCE32(ce32);
171                    int length = Collation.lengthFromCE32(ce32);
172                    sink.handleExpansion(data.ces, idx, length);
173                }
174                // Optimization: If we have a prefix,
175                // then the relevant strings have been added already.
176                if (unreversedPrefix.length() == 0) {
177                    addExpansions(start, end);
178                }
179                return;
180            case Collation.PREFIX_TAG:
181                handlePrefixes(start, end, ce32);
182                return;
183            case Collation.CONTRACTION_TAG:
184                handleContractions(start, end, ce32);
185                return;
186            case Collation.DIGIT_TAG:
187                // Fetch the non-numeric-collation CE32 and continue.
188                ce32 = data.ce32s[Collation.indexFromCE32(ce32)];
189                break;
190            case Collation.U0000_TAG:
191                assert (start == 0 && end == 0);
192                // Fetch the normal ce32 for U+0000 and continue.
193                ce32 = data.ce32s[0];
194                break;
195            case Collation.HANGUL_TAG:
196                if (sink != null) {
197                    // TODO: This should be optimized,
198                    // especially if [start..end] is the complete Hangul range. (assert that)
199                    UTF16CollationIterator iter = new UTF16CollationIterator(data);
200                    StringBuilder hangul = new StringBuilder(1);
201                    for (int c = start; c <= end; ++c) {
202                        hangul.setLength(0);
203                        hangul.appendCodePoint(c);
204                        iter.setText(false, hangul, 0);
205                        int length = iter.fetchCEs();
206                        // Ignore the terminating non-CE.
207                        assert (length >= 2 && iter.getCE(length - 1) == Collation.NO_CE);
208                        sink.handleExpansion(iter.getCEs(), 0, length - 1);
209                    }
210                }
211                // Optimization: If we have a prefix,
212                // then the relevant strings have been added already.
213                if (unreversedPrefix.length() == 0) {
214                    addExpansions(start, end);
215                }
216                return;
217            case Collation.OFFSET_TAG:
218                // Currently no need to send offset CEs to the sink.
219                return;
220            case Collation.IMPLICIT_TAG:
221                // Currently no need to send implicit CEs to the sink.
222                return;
223            }
224        }
225    }
226
227    private void handlePrefixes(int start, int end, int ce32) {
228        int index = Collation.indexFromCE32(ce32);
229        ce32 = data.getCE32FromContexts(index); // Default if no prefix match.
230        handleCE32(start, end, ce32);
231        if (!addPrefixes) {
232            return;
233        }
234        CharsTrie.Iterator prefixes = new CharsTrie(data.contexts, index + 2).iterator();
235        while (prefixes.hasNext()) {
236            Entry e = prefixes.next();
237            setPrefix(e.chars);
238            // Prefix/pre-context mappings are special kinds of contractions
239            // that always yield expansions.
240            addStrings(start, end, contractions);
241            addStrings(start, end, expansions);
242            handleCE32(start, end, e.value);
243        }
244        resetPrefix();
245    }
246
247    void handleContractions(int start, int end, int ce32) {
248        int index = Collation.indexFromCE32(ce32);
249        if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
250            // No match on the single code point.
251            // We are underneath a prefix, and the default mapping is just
252            // a fallback to the mappings for a shorter prefix.
253            assert (unreversedPrefix.length() != 0);
254        } else {
255            ce32 = data.getCE32FromContexts(index); // Default if no suffix match.
256            assert (!Collation.isContractionCE32(ce32));
257            handleCE32(start, end, ce32);
258        }
259        CharsTrie.Iterator suffixes = new CharsTrie(data.contexts, index + 2).iterator();
260        while (suffixes.hasNext()) {
261            Entry e = suffixes.next();
262            suffix = e.chars.toString();
263            addStrings(start, end, contractions);
264            if (unreversedPrefix.length() != 0) {
265                addStrings(start, end, expansions);
266            }
267            handleCE32(start, end, e.value);
268        }
269        suffix = null;
270    }
271
272    void addExpansions(int start, int end) {
273        if (unreversedPrefix.length() == 0 && suffix == null) {
274            if (expansions != null) {
275                expansions.add(start, end);
276            }
277        } else {
278            addStrings(start, end, expansions);
279        }
280    }
281
282    void addStrings(int start, int end, UnicodeSet set) {
283        if (set == null) {
284            return;
285        }
286        StringBuilder s = new StringBuilder(unreversedPrefix);
287        do {
288            s.appendCodePoint(start);
289            if (suffix != null) {
290                s.append(suffix);
291            }
292            set.add(s);
293            s.setLength(unreversedPrefix.length());
294        } while (++start <= end);
295    }
296
297    // Prefixes are reversed in the data structure.
298    private void setPrefix(CharSequence pfx) {
299        unreversedPrefix.setLength(0);
300        unreversedPrefix.append(pfx).reverse();
301    }
302
303    private void resetPrefix() {
304        unreversedPrefix.setLength(0);
305    }
306}