1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 1996-2014, International Business Machines Corporation and    *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9package com.ibm.icu.text;
10
11import java.util.ArrayList;
12import java.util.HashSet;
13import java.util.Iterator;
14import java.util.List;
15import java.util.Set;
16
17import com.ibm.icu.impl.Norm2AllModes;
18import com.ibm.icu.impl.Normalizer2Impl;
19import com.ibm.icu.impl.Utility;
20import com.ibm.icu.lang.UCharacter;
21
22/**
23 * This class allows one to iterate through all the strings that are canonically equivalent to a given
24 * string. For example, here are some sample results:
25 * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
26 * <pre>
27 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
28 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
29 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
30 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
31 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
32 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
33 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
34 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
35 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
3610: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
3711: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
3812: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
39 *</pre>
40 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
41 * since it has not been optimized for that situation.
42 * @author M. Davis
43 * @stable ICU 2.4
44 */
45
46public final class CanonicalIterator {
47    /**
48     * Construct a CanonicalIterator object
49     * @param source string to get results for
50     * @stable ICU 2.4
51     */
52    public CanonicalIterator(String source) {
53        Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
54        nfd = allModes.decomp;
55        nfcImpl = allModes.impl.ensureCanonIterData();
56        setSource(source);
57    }
58
59    /**
60     * Gets the NFD form of the current source we are iterating over.
61     * @return gets the source: NOTE: it is the NFD form of the source originally passed in
62     * @stable ICU 2.4
63     */
64    public String getSource() {
65      return source;
66    }
67
68    /**
69     * Resets the iterator so that one can start again from the beginning.
70     * @stable ICU 2.4
71     */
72    public void reset() {
73        done = false;
74        for (int i = 0; i < current.length; ++i) {
75            current[i] = 0;
76        }
77    }
78
79    /**
80     * Get the next canonically equivalent string.
81     * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
82     * @return the next string that is canonically equivalent. The value null is returned when
83     * the iteration is done.
84     * @stable ICU 2.4
85     */
86    public String next() {
87        if (done) return null;
88
89        // construct return value
90
91        buffer.setLength(0); // delete old contents
92        for (int i = 0; i < pieces.length; ++i) {
93            buffer.append(pieces[i][current[i]]);
94        }
95        String result = buffer.toString();
96
97        // find next value for next time
98
99        for (int i = current.length - 1; ; --i) {
100            if (i < 0) {
101                done = true;
102                break;
103            }
104            current[i]++;
105            if (current[i] < pieces[i].length) break; // got sequence
106            current[i] = 0;
107        }
108        return result;
109    }
110
111    /**
112     * Set a new source for this iterator. Allows object reuse.
113     * @param newSource the source string to iterate against. This allows the same iterator to be used
114     * while changing the source string, saving object creation.
115     * @stable ICU 2.4
116     */
117    public void setSource(String newSource) {
118        source = nfd.normalize(newSource);
119        done = false;
120
121        // catch degenerate case
122        if (newSource.length() == 0) {
123            pieces = new String[1][];
124            current = new int[1];
125            pieces[0] = new String[]{""};
126            return;
127        }
128
129        // find the segments
130        List<String> segmentList = new ArrayList<String>();
131        int cp;
132        int start = 0;
133
134        // i should be the end of the first code point
135        // break up the string into segements
136
137        int i = UTF16.findOffsetFromCodePoint(source, 1);
138
139        for (; i < source.length(); i += Character.charCount(cp)) {
140            cp = source.codePointAt(i);
141            if (nfcImpl.isCanonSegmentStarter(cp)) {
142                segmentList.add(source.substring(start, i)); // add up to i
143                start = i;
144            }
145        }
146        segmentList.add(source.substring(start, i)); // add last one
147
148        // allocate the arrays, and find the strings that are CE to each segment
149        pieces = new String[segmentList.size()][];
150        current = new int[segmentList.size()];
151        for (i = 0; i < pieces.length; ++i) {
152            if (PROGRESS) System.out.println("SEGMENT");
153            pieces[i] = getEquivalents(segmentList.get(i));
154        }
155    }
156
157    /**
158     * Simple implementation of permutation.
159     * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
160     * @param source the string to find permutations for
161     * @param skipZeros set to true to skip characters with canonical combining class zero
162     * @param output the set to add the results to
163     * @internal
164     * @deprecated This API is ICU internal only.
165     */
166    @Deprecated
167    public static void permute(String source, boolean skipZeros, Set<String> output) {
168        // TODO: optimize
169        //if (PROGRESS) System.out.println("Permute: " + source);
170
171        // optimization:
172        // if zero or one character, just return a set with it
173        // we check for length < 2 to keep from counting code points all the time
174        if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
175            output.add(source);
176            return;
177        }
178
179        // otherwise iterate through the string, and recursively permute all the other characters
180        Set<String> subpermute = new HashSet<String>();
181        int cp;
182        for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
183            cp = UTF16.charAt(source, i);
184
185            // optimization:
186            // if the character is canonical combining class zero,
187            // don't permute it
188            if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
189                //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
190                continue;
191            }
192
193            // see what the permutations of the characters before and after this one are
194            subpermute.clear();
195            permute(source.substring(0,i)
196                + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
197
198            // prefix this character to all of them
199            String chStr = UTF16.valueOf(source, i);
200            for (String s : subpermute) {
201                String piece = chStr + s;
202                //if (PROGRESS) System.out.println("  Piece: " + piece);
203                output.add(piece);
204            }
205        }
206    }
207
208    // FOR TESTING
209
210    /*
211     *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
212     *
213    public static UnicodeSet getSafeStart() {
214        return (UnicodeSet) SAFE_START.clone();
215    }
216    */
217    /*
218     *@return the set of characters whose decompositions start with the given character
219     *
220    public static UnicodeSet getStarts(int cp) {
221        UnicodeSet result = AT_START.get(cp);
222        if (result == null) result = EMPTY;
223        return (UnicodeSet) result.clone();
224    }
225    */
226
227    // ===================== PRIVATES ==============================
228
229    // debug
230    private static boolean PROGRESS = false; // debug progress
231    //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
232    private static boolean SKIP_ZEROS = true;
233
234    // fields
235    private final Normalizer2 nfd;
236    private final Normalizer2Impl nfcImpl;
237    private String source;
238    private boolean done;
239    private String[][] pieces;
240    private int[] current;
241    // Note: C will need two more fields, since arrays there don't have lengths
242    // int pieces_length;
243    // int[] pieces_lengths;
244
245    // transient fields
246    private transient StringBuilder buffer = new StringBuilder();
247
248
249    // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
250    private String[] getEquivalents(String segment) {
251        Set<String> result = new HashSet<String>();
252        Set<String> basic = getEquivalents2(segment);
253        Set<String> permutations = new HashSet<String>();
254
255        // now get all the permutations
256        // add only the ones that are canonically equivalent
257        // TODO: optimize by not permuting any class zero.
258        Iterator<String> it = basic.iterator();
259        while (it.hasNext()) {
260            String item = it.next();
261            permutations.clear();
262            permute(item, SKIP_ZEROS, permutations);
263            Iterator<String> it2 = permutations.iterator();
264            while (it2.hasNext()) {
265                String possible = it2.next();
266
267/*
268                String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
269                if (attempt.equals(segment)) {
270*/
271                if (Normalizer.compare(possible, segment,0)==0) {
272
273                    if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
274                    result.add(possible);
275
276                } else {
277                    if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
278                }
279            }
280        }
281
282        // convert into a String[] to clean up storage
283        String[] finalResult = new String[result.size()];
284        result.toArray(finalResult);
285        return finalResult;
286    }
287
288
289    private Set<String> getEquivalents2(String segment) {
290
291        Set<String> result = new HashSet<String>();
292
293        if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
294
295        result.add(segment);
296        StringBuffer workingBuffer = new StringBuffer();
297        UnicodeSet starts = new UnicodeSet();
298
299        // cycle through all the characters
300        int cp;
301        for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
302
303            // see if any character is at the start of some decomposition
304            cp = segment.codePointAt(i);
305            if (!nfcImpl.getCanonStartSet(cp, starts)) {
306              continue;
307            }
308            // if so, see which decompositions match
309            for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
310                int cp2 = iter.codepoint;
311                Set<String> remainder = extract(cp2, segment, i, workingBuffer);
312                if (remainder == null) {
313                    continue;
314                }
315
316                // there were some matches, so add all the possibilities to the set.
317                String prefix= segment.substring(0,i);
318                prefix += UTF16.valueOf(cp2);
319                for (String item : remainder) {
320                    result.add(prefix + item);
321                }
322            }
323        }
324        return result;
325        /*
326        Set result = new HashSet();
327        if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
328        result.add(segment);
329        StringBuffer workingBuffer = new StringBuffer();
330
331        // cycle through all the characters
332        int cp;
333
334        for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
335            // see if any character is at the start of some decomposition
336            cp = UTF16.charAt(segment, i);
337            NormalizerImpl.getCanonStartSet(c,fillSet)
338            UnicodeSet starts = AT_START.get(cp);
339            if (starts == null) continue;
340            UnicodeSetIterator usi = new UnicodeSetIterator(starts);
341            // if so, see which decompositions match
342            while (usi.next()) {
343                int cp2 = usi.codepoint;
344                // we know that there are no strings in it
345                // so we don't have to check CharacterIterator.IS_STRING
346                Set remainder = extract(cp2, segment, i, workingBuffer);
347                if (remainder == null) continue;
348
349                // there were some matches, so add all the possibilities to the set.
350                String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
351                Iterator it = remainder.iterator();
352                while (it.hasNext()) {
353                    String item = (String) it.next();
354                    if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
355                    result.add(prefix + item);
356                }
357            }
358        }
359        return result;
360        */
361    }
362
363    /**
364     * See if the decomposition of cp2 is at segment starting at segmentPos
365     * (with canonical rearrangment!)
366     * If so, take the remainder, and return the equivalents
367     */
368    private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) {
369        if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
370            + ", " + Utility.hex(segment.substring(segmentPos)));
371
372        String decomp = nfcImpl.getDecomposition(comp);
373        if (decomp == null) {
374            decomp = UTF16.valueOf(comp);
375        }
376
377        // See if it matches the start of segment (at segmentPos)
378        boolean ok = false;
379        int cp;
380        int decompPos = 0;
381        int decompCp = UTF16.charAt(decomp,0);
382        decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
383        //int decompClass = getClass(decompCp);
384        buf.setLength(0); // initialize working buffer, shared among callees
385
386        for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
387            cp = UTF16.charAt(segment, i);
388            if (cp == decompCp) { // if equal, eat another cp from decomp
389                if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
390                if (decompPos == decomp.length()) { // done, have all decomp characters!
391                    buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
392                    ok = true;
393                    break;
394                }
395                decompCp = UTF16.charAt(decomp, decompPos);
396                decompPos += UTF16.getCharCount(decompCp);
397                //decompClass = getClass(decompCp);
398            } else {
399                if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
400                // brute force approach
401                UTF16.append(buf, cp);
402                /* TODO: optimize
403                // since we know that the classes are monotonically increasing, after zero
404                // e.g. 0 5 7 9 0 3
405                // we can do an optimization
406                // there are only a few cases that work: zero, less, same, greater
407                // if both classes are the same, we fail
408                // if the decomp class < the segment class, we fail
409
410                segClass = getClass(cp);
411                if (decompClass <= segClass) return null;
412                */
413            }
414        }
415        if (!ok) return null; // we failed, characters left over
416        if (PROGRESS) System.out.println("Matches");
417        if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
418        String remainder = buf.toString();
419
420        // brute force approach
421        // to check to make sure result is canonically equivalent
422        /*
423        String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
424        if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
425        */
426
427        if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
428
429        // get the remaining combinations
430        return getEquivalents2(remainder);
431    }
432
433    /*
434    // TODO: fix once we have a codepoint interface to get the canonical combining class
435    // TODO: Need public access to canonical combining class in UCharacter!
436    private static int getClass(int cp) {
437        return Normalizer.getClass((char)cp);
438    }
439    */
440
441   // ================= BUILDER =========================
442    // TODO: Flatten this data so it doesn't have to be reconstructed each time!
443
444    //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
445    private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change
446    static {
447        SET_WITH_NULL_STRING.add("");
448    }
449
450  //  private static UnicodeSet SAFE_START = new UnicodeSet();
451  //  private static CharMap AT_START = new CharMap();
452
453        // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
454        // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
455  //  private static int LAST_UNICODE = 0x10FFFF;
456    /*
457    static {
458        buildData();
459    }
460    */
461    /*
462    private static void buildData() {
463
464        if (PROGRESS) System.out.println("Getting Safe Start");
465        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
466            if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
467            int cc = UCharacter.getCombiningClass(cp);
468            if (cc == 0) SAFE_START.add(cp);
469            // will fix to be really safe below
470        }
471        if (PROGRESS) System.out.println();
472
473        if (PROGRESS) System.out.println("Getting Containment");
474        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
475            if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
476
477            if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
478
479            //String istr = UTF16.valueOf(cp);
480            String decomp = Normalizer.normalize(cp, Normalizer.NFD);
481            //if (decomp.equals(istr)) continue;
482
483            // add each character in the decomposition to canBeIn
484
485            int component;
486            for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
487                component = UTF16.charAt(decomp, i);
488                if (i == 0) {
489                    AT_START.add(component, cp);
490                } else if (UCharacter.getCombiningClass(component) == 0) {
491                    SAFE_START.remove(component);
492                }
493            }
494        }
495        if (PROGRESS) System.out.println();
496    }
497        // the following is just for a map from characters to a set of characters
498
499    private static class CharMap {
500        Map storage = new HashMap();
501        MutableInt probe = new MutableInt();
502        boolean converted = false;
503
504        public void add(int cp, int whatItIsIn) {
505            UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
506            if (result == null) {
507                result = new UnicodeSet();
508                storage.put(probe, result);
509            }
510            result.add(whatItIsIn);
511        }
512
513        public UnicodeSet get(int cp) {
514            return (UnicodeSet) storage.get(probe.set(cp));
515        }
516    }
517
518    private static class MutableInt {
519        public int contents;
520        public int hashCode() { return contents; }
521        public boolean equals(Object other) {
522            return ((MutableInt)other).contents == contents;
523        }
524        // allows chaining
525        public MutableInt set(int contents) {
526            this.contents = contents;
527            return this;
528        }
529    }
530    */
531
532}
533