1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 * Copyright (C) 1996-2014, International Business Machines Corporation and    *
7 * others. All Rights Reserved.                                                *
8 *******************************************************************************
9 */
10package android.icu.text;
11
12import java.util.ArrayList;
13import java.util.HashSet;
14import java.util.Iterator;
15import java.util.List;
16import java.util.Set;
17
18import android.icu.impl.Norm2AllModes;
19import android.icu.impl.Normalizer2Impl;
20import android.icu.impl.Utility;
21import android.icu.lang.UCharacter;
22
23/**
24 * This class allows one to iterate through all the strings that are canonically equivalent to a given
25 * string. For example, here are some sample results:
26 * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
27 * <pre>
28 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
29 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
30 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
31 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
32 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
33 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
34 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
35 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
36 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
3710: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
3811: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
3912: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
40 *</pre>
41 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
42 * since it has not been optimized for that situation.
43 * @author M. Davis
44 * @hide Only a subset of ICU is exposed in Android
45 */
46
47public final class CanonicalIterator {
48    /**
49     * Construct a CanonicalIterator object
50     * @param source string to get results for
51     */
52    public CanonicalIterator(String source) {
53        Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
54        nfd = allModes.decomp;
55        nfcImpl = allModes.impl.ensureCanonIterData();
56        setSource(source);
57    }
58
59    /**
60     * Gets the NFD form of the current source we are iterating over.
61     * @return gets the source: NOTE: it is the NFD form of the source originally passed in
62     */
63    public String getSource() {
64      return source;
65    }
66
67    /**
68     * Resets the iterator so that one can start again from the beginning.
69     */
70    public void reset() {
71        done = false;
72        for (int i = 0; i < current.length; ++i) {
73            current[i] = 0;
74        }
75    }
76
77    /**
78     * Get the next canonically equivalent string.
79     * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
80     * @return the next string that is canonically equivalent. The value null is returned when
81     * the iteration is done.
82     */
83    public String next() {
84        if (done) return null;
85
86        // construct return value
87
88        buffer.setLength(0); // delete old contents
89        for (int i = 0; i < pieces.length; ++i) {
90            buffer.append(pieces[i][current[i]]);
91        }
92        String result = buffer.toString();
93
94        // find next value for next time
95
96        for (int i = current.length - 1; ; --i) {
97            if (i < 0) {
98                done = true;
99                break;
100            }
101            current[i]++;
102            if (current[i] < pieces[i].length) break; // got sequence
103            current[i] = 0;
104        }
105        return result;
106    }
107
108    /**
109     * Set a new source for this iterator. Allows object reuse.
110     * @param newSource the source string to iterate against. This allows the same iterator to be used
111     * while changing the source string, saving object creation.
112     */
113    public void setSource(String newSource) {
114        source = nfd.normalize(newSource);
115        done = false;
116
117        // catch degenerate case
118        if (newSource.length() == 0) {
119            pieces = new String[1][];
120            current = new int[1];
121            pieces[0] = new String[]{""};
122            return;
123        }
124
125        // find the segments
126        List<String> segmentList = new ArrayList<String>();
127        int cp;
128        int start = 0;
129
130        // i should be the end of the first code point
131        // break up the string into segements
132
133        int i = UTF16.findOffsetFromCodePoint(source, 1);
134
135        for (; i < source.length(); i += Character.charCount(cp)) {
136            cp = source.codePointAt(i);
137            if (nfcImpl.isCanonSegmentStarter(cp)) {
138                segmentList.add(source.substring(start, i)); // add up to i
139                start = i;
140            }
141        }
142        segmentList.add(source.substring(start, i)); // add last one
143
144        // allocate the arrays, and find the strings that are CE to each segment
145        pieces = new String[segmentList.size()][];
146        current = new int[segmentList.size()];
147        for (i = 0; i < pieces.length; ++i) {
148            if (PROGRESS) System.out.println("SEGMENT");
149            pieces[i] = getEquivalents(segmentList.get(i));
150        }
151    }
152
153    /**
154     * Simple implementation of permutation.
155     * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
156     * @param source the string to find permutations for
157     * @param skipZeros set to true to skip characters with canonical combining class zero
158     * @param output the set to add the results to
159     * @deprecated This API is ICU internal only.
160     * @hide draft / provisional / internal are hidden on Android
161     */
162    @Deprecated
163    public static void permute(String source, boolean skipZeros, Set<String> output) {
164        // TODO: optimize
165        //if (PROGRESS) System.out.println("Permute: " + source);
166
167        // optimization:
168        // if zero or one character, just return a set with it
169        // we check for length < 2 to keep from counting code points all the time
170        if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
171            output.add(source);
172            return;
173        }
174
175        // otherwise iterate through the string, and recursively permute all the other characters
176        Set<String> subpermute = new HashSet<String>();
177        int cp;
178        for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
179            cp = UTF16.charAt(source, i);
180
181            // optimization:
182            // if the character is canonical combining class zero,
183            // don't permute it
184            if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
185                //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
186                continue;
187            }
188
189            // see what the permutations of the characters before and after this one are
190            subpermute.clear();
191            permute(source.substring(0,i)
192                + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
193
194            // prefix this character to all of them
195            String chStr = UTF16.valueOf(source, i);
196            for (String s : subpermute) {
197                String piece = chStr + s;
198                //if (PROGRESS) System.out.println("  Piece: " + piece);
199                output.add(piece);
200            }
201        }
202    }
203
204    // FOR TESTING
205
206    /*
207     *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
208     *
209    public static UnicodeSet getSafeStart() {
210        return (UnicodeSet) SAFE_START.clone();
211    }
212    */
213    /*
214     *@return the set of characters whose decompositions start with the given character
215     *
216    public static UnicodeSet getStarts(int cp) {
217        UnicodeSet result = AT_START.get(cp);
218        if (result == null) result = EMPTY;
219        return (UnicodeSet) result.clone();
220    }
221    */
222
223    // ===================== PRIVATES ==============================
224
225    // debug
226    private static boolean PROGRESS = false; // debug progress
227    //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
228    private static boolean SKIP_ZEROS = true;
229
230    // fields
231    private final Normalizer2 nfd;
232    private final Normalizer2Impl nfcImpl;
233    private String source;
234    private boolean done;
235    private String[][] pieces;
236    private int[] current;
237    // Note: C will need two more fields, since arrays there don't have lengths
238    // int pieces_length;
239    // int[] pieces_lengths;
240
241    // transient fields
242    private transient StringBuilder buffer = new StringBuilder();
243
244
245    // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
246    private String[] getEquivalents(String segment) {
247        Set<String> result = new HashSet<String>();
248        Set<String> basic = getEquivalents2(segment);
249        Set<String> permutations = new HashSet<String>();
250
251        // now get all the permutations
252        // add only the ones that are canonically equivalent
253        // TODO: optimize by not permuting any class zero.
254        Iterator<String> it = basic.iterator();
255        while (it.hasNext()) {
256            String item = it.next();
257            permutations.clear();
258            permute(item, SKIP_ZEROS, permutations);
259            Iterator<String> it2 = permutations.iterator();
260            while (it2.hasNext()) {
261                String possible = it2.next();
262
263/*
264                String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
265                if (attempt.equals(segment)) {
266*/
267                if (Normalizer.compare(possible, segment,0)==0) {
268
269                    if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
270                    result.add(possible);
271
272                } else {
273                    if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
274                }
275            }
276        }
277
278        // convert into a String[] to clean up storage
279        String[] finalResult = new String[result.size()];
280        result.toArray(finalResult);
281        return finalResult;
282    }
283
284
285    private Set<String> getEquivalents2(String segment) {
286
287        Set<String> result = new HashSet<String>();
288
289        if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
290
291        result.add(segment);
292        StringBuffer workingBuffer = new StringBuffer();
293        UnicodeSet starts = new UnicodeSet();
294
295        // cycle through all the characters
296        int cp;
297        for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
298
299            // see if any character is at the start of some decomposition
300            cp = segment.codePointAt(i);
301            if (!nfcImpl.getCanonStartSet(cp, starts)) {
302              continue;
303            }
304            // if so, see which decompositions match
305            for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
306                int cp2 = iter.codepoint;
307                Set<String> remainder = extract(cp2, segment, i, workingBuffer);
308                if (remainder == null) {
309                    continue;
310                }
311
312                // there were some matches, so add all the possibilities to the set.
313                String prefix= segment.substring(0,i);
314                prefix += UTF16.valueOf(cp2);
315                for (String item : remainder) {
316                    result.add(prefix + item);
317                }
318            }
319        }
320        return result;
321        /*
322        Set result = new HashSet();
323        if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
324        result.add(segment);
325        StringBuffer workingBuffer = new StringBuffer();
326
327        // cycle through all the characters
328        int cp;
329
330        for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
331            // see if any character is at the start of some decomposition
332            cp = UTF16.charAt(segment, i);
333            NormalizerImpl.getCanonStartSet(c,fillSet)
334            UnicodeSet starts = AT_START.get(cp);
335            if (starts == null) continue;
336            UnicodeSetIterator usi = new UnicodeSetIterator(starts);
337            // if so, see which decompositions match
338            while (usi.next()) {
339                int cp2 = usi.codepoint;
340                // we know that there are no strings in it
341                // so we don't have to check CharacterIterator.IS_STRING
342                Set remainder = extract(cp2, segment, i, workingBuffer);
343                if (remainder == null) continue;
344
345                // there were some matches, so add all the possibilities to the set.
346                String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
347                Iterator it = remainder.iterator();
348                while (it.hasNext()) {
349                    String item = (String) it.next();
350                    if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
351                    result.add(prefix + item);
352                }
353            }
354        }
355        return result;
356        */
357    }
358
359    /**
360     * See if the decomposition of cp2 is at segment starting at segmentPos
361     * (with canonical rearrangment!)
362     * If so, take the remainder, and return the equivalents
363     */
364    private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) {
365        if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
366            + ", " + Utility.hex(segment.substring(segmentPos)));
367
368        String decomp = nfcImpl.getDecomposition(comp);
369        if (decomp == null) {
370            decomp = UTF16.valueOf(comp);
371        }
372
373        // See if it matches the start of segment (at segmentPos)
374        boolean ok = false;
375        int cp;
376        int decompPos = 0;
377        int decompCp = UTF16.charAt(decomp,0);
378        decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
379        //int decompClass = getClass(decompCp);
380        buf.setLength(0); // initialize working buffer, shared among callees
381
382        for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
383            cp = UTF16.charAt(segment, i);
384            if (cp == decompCp) { // if equal, eat another cp from decomp
385                if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
386                if (decompPos == decomp.length()) { // done, have all decomp characters!
387                    buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
388                    ok = true;
389                    break;
390                }
391                decompCp = UTF16.charAt(decomp, decompPos);
392                decompPos += UTF16.getCharCount(decompCp);
393                //decompClass = getClass(decompCp);
394            } else {
395                if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
396                // brute force approach
397                UTF16.append(buf, cp);
398                /* TODO: optimize
399                // since we know that the classes are monotonically increasing, after zero
400                // e.g. 0 5 7 9 0 3
401                // we can do an optimization
402                // there are only a few cases that work: zero, less, same, greater
403                // if both classes are the same, we fail
404                // if the decomp class < the segment class, we fail
405
406                segClass = getClass(cp);
407                if (decompClass <= segClass) return null;
408                */
409            }
410        }
411        if (!ok) return null; // we failed, characters left over
412        if (PROGRESS) System.out.println("Matches");
413        if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
414        String remainder = buf.toString();
415
416        // brute force approach
417        // to check to make sure result is canonically equivalent
418        /*
419        String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
420        if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
421        */
422
423        if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
424
425        // get the remaining combinations
426        return getEquivalents2(remainder);
427    }
428
429    /*
430    // TODO: fix once we have a codepoint interface to get the canonical combining class
431    // TODO: Need public access to canonical combining class in UCharacter!
432    private static int getClass(int cp) {
433        return Normalizer.getClass((char)cp);
434    }
435    */
436
437   // ================= BUILDER =========================
438    // TODO: Flatten this data so it doesn't have to be reconstructed each time!
439
440    //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
441    private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change
442    static {
443        SET_WITH_NULL_STRING.add("");
444    }
445
446  //  private static UnicodeSet SAFE_START = new UnicodeSet();
447  //  private static CharMap AT_START = new CharMap();
448
449        // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
450        // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
451  //  private static int LAST_UNICODE = 0x10FFFF;
452    /*
453    static {
454        buildData();
455    }
456    */
457    /*
458    private static void buildData() {
459
460        if (PROGRESS) System.out.println("Getting Safe Start");
461        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
462            if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
463            int cc = UCharacter.getCombiningClass(cp);
464            if (cc == 0) SAFE_START.add(cp);
465            // will fix to be really safe below
466        }
467        if (PROGRESS) System.out.println();
468
469        if (PROGRESS) System.out.println("Getting Containment");
470        for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
471            if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
472
473            if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
474
475            //String istr = UTF16.valueOf(cp);
476            String decomp = Normalizer.normalize(cp, Normalizer.NFD);
477            //if (decomp.equals(istr)) continue;
478
479            // add each character in the decomposition to canBeIn
480
481            int component;
482            for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
483                component = UTF16.charAt(decomp, i);
484                if (i == 0) {
485                    AT_START.add(component, cp);
486                } else if (UCharacter.getCombiningClass(component) == 0) {
487                    SAFE_START.remove(component);
488                }
489            }
490        }
491        if (PROGRESS) System.out.println();
492    }
493        // the following is just for a map from characters to a set of characters
494
495    private static class CharMap {
496        Map storage = new HashMap();
497        MutableInt probe = new MutableInt();
498        boolean converted = false;
499
500        public void add(int cp, int whatItIsIn) {
501            UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
502            if (result == null) {
503                result = new UnicodeSet();
504                storage.put(probe, result);
505            }
506            result.add(whatItIsIn);
507        }
508
509        public UnicodeSet get(int cp) {
510            return (UnicodeSet) storage.get(probe.set(cp));
511        }
512    }
513
514    private static class MutableInt {
515        public int contents;
516        public int hashCode() { return contents; }
517        public boolean equals(Object other) {
518            return ((MutableInt)other).contents == contents;
519        }
520        // allows chaining
521        public MutableInt set(int contents) {
522            this.contents = contents;
523            return this;
524        }
525    }
526    */
527
528}
529