1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 ***************************************************************************
6 * Copyright (C) 2008-2016 International Business Machines Corporation
7 * and others. All Rights Reserved.
8 ***************************************************************************
9 *
10 * Unicode Spoof Detection
11 */
12
13package android.icu.text;
14
15import java.io.IOException;
16import java.io.LineNumberReader;
17import java.io.Reader;
18import java.nio.ByteBuffer;
19import java.text.ParseException;
20import java.util.ArrayList;
21import java.util.Arrays;
22import java.util.BitSet;
23import java.util.Collections;
24import java.util.Comparator;
25import java.util.HashSet;
26import java.util.Hashtable;
27import java.util.LinkedHashSet;
28import java.util.Locale;
29import java.util.MissingResourceException;
30import java.util.Set;
31import java.util.Vector;
32import java.util.regex.Matcher;
33import java.util.regex.Pattern;
34
35import android.icu.impl.ICUBinary;
36import android.icu.impl.ICUBinary.Authenticate;
37import android.icu.impl.Utility;
38import android.icu.lang.UCharacter;
39import android.icu.lang.UCharacterCategory;
40import android.icu.lang.UProperty;
41import android.icu.lang.UScript;
42import android.icu.util.ULocale;
43
44/**
45 * <p>
46 * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
47 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
48 *
49 * <ol>
50 * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desordenado" and
51 * "ԁеѕогԁепаԁо".</li>
52 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
53 * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li>
54 * </ol>
55 *
56 * <p>
57 * Although originally designed as a method for flagging suspicious identifier strings such as URLs,
58 * <code>SpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
59 * content filters.
60 *
61 * <h2>Confusables</h2>
62 *
63 * <p>
64 * The following example shows how to use <code>SpoofChecker</code> to check for confusability between two strings:
65 *
66 * <pre>
67 * <code>
68 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
69 * int result = sc.areConfusable("desordenado", "ԁеѕогԁепаԁо");
70 * System.out.println(result != 0);  // true
71 * </code>
72 * </pre>
73 *
74 * <p>
75 * <code>SpoofChecker</code> uses a builder paradigm: options are specified within the context of a lightweight
76 * {@link SpoofChecker.Builder} object, and upon calling {@link SpoofChecker.Builder#build}, expensive data loading
77 * operations are performed, and an immutable <code>SpoofChecker</code> is returned.
78 *
79 * <p>
80 * The first line of the example creates a <code>SpoofChecker</code> object with confusable-checking enabled; the second
81 * line performs the confusability test. For best performance, the instance should be created once (e.g., upon
82 * application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime.
83 *
84 * <p>
85 * UTS 39 defines two strings to be <em>confusable</em> if they map to the same skeleton. A <em>skeleton</em> is a
86 * sequence of families of confusable characters, where each family has a single exemplar character.
87 * {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is
88 * equivalent to the example above:
89 *
90 * <pre>
91 * <code>
92 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
93 * boolean result = sc.getSkeleton("desordenado").equals(sc.getSkeleton("ԁеѕогԁепаԁо"));
94 * System.out.println(result);  // true
95 * </code>
96 * </pre>
97 *
98 * <p>
99 * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
100 * {@link SpoofChecker#areConfusable} many times in a loop, {@link SpoofChecker#getSkeleton} can be used instead, as
101 * shown below:
102 *
103 * <pre>
104 * // Setup:
105 * String[] DICTIONARY = new String[]{ "lorem", "ipsum" }; // example
106 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
107 * HashSet&lt;String&gt; skeletons = new HashSet&lt;String&gt;();
108 * for (String word : DICTIONARY) {
109 *   skeletons.add(sc.getSkeleton(word));
110 * }
111 *
112 * // Live Check:
113 * boolean result = skeletons.contains(sc.getSkeleton("1orern"));
114 * System.out.println(result);  // true
115 * </pre>
116 *
117 * <p>
118 * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
119 * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
120 * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
121 *
122 * <h2>Spoof Detection</h2>
123 *
124 * <p>
125 * The following snippet shows a minimal example of using <code>SpoofChecker</code> to perform spoof detection on a
126 * string:
127 *
128 * <pre>
129 * SpoofChecker sc = new SpoofChecker.Builder()
130 *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
131 *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
132 *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
133 *     .build();
134 * boolean result = sc.failsChecks("pаypаl");  // with Cyrillic 'а' characters
135 * System.out.println(result);  // true
136 * </pre>
137 *
138 * <p>
139 * As in the case for confusability checking, it is good practice to create one <code>SpoofChecker</code> instance at
140 * startup, and call the cheaper {@link SpoofChecker#failsChecks} online. In the second line, we specify the set of
141 * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. In the
142 * third line, the CONFUSABLE checks are disabled. It is good practice to disable them if you won't be using the
143 * instance to perform confusability checking.
144 *
145 * <p>
146 * To get more details on why a string failed the checks, use a {@link SpoofChecker.CheckResult}:
147 *
148 * <pre>
149 * <code>
150 * SpoofChecker sc = new SpoofChecker.Builder()
151 *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
152 *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
153 *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
154 *     .build();
155 * SpoofChecker.CheckResult checkResult = new SpoofChecker.CheckResult();
156 * boolean result = sc.failsChecks("pаypаl", checkResult);
157 * System.out.println(checkResult.checks);  // 16
158 * </code>
159 * </pre>
160 *
161 * <p>
162 * The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
163 * {@link SpoofChecker#RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
164 *
165 * <ul>
166 * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
167 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
168 * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
169 * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
170 * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
171 * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
172 * characters. See {@link SpoofChecker.Builder#setAllowedChars} and {@link SpoofChecker.Builder#setAllowedLocales}.</li>
173 * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
174 * </ul>
175 *
176 * <p>
177 * These checks can be enabled independently of each other. For example, if you were interested in checking for only the
178 * INVISIBLE and MIXED_NUMBERS conditions, you could do:
179 *
180 * <pre>
181 * <code>
182 * SpoofChecker sc = new SpoofChecker.Builder()
183 *     .setChecks(SpoofChecker.INVISIBLE | SpoofChecker.MIXED_NUMBERS)
184 *     .build();
185 * boolean result = sc.failsChecks("৪8");
186 * System.out.println(result);  // true
187 * </code>
188 * </pre>
189 *
190 * <p>
191 * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
192 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
193 * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
194 * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
195 * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
196 * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
197 * the levels, see UTS 39 or {@link SpoofChecker.RestrictionLevel}. The Restriction Level test is aware of the set of
198 * allowed characters set in {@link SpoofChecker.Builder#setAllowedChars}. Note that characters which have script code
199 * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
200 * scripts.
201 *
202 * <h2>Additional Information</h2>
203 *
204 * <p>
205 * A <code>SpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
206 *
207 * <p>
208 * <b>Thread Safety:</b> The methods on <code>SpoofChecker</code> objects are thread safe. The test functions for
 * checking a single identifier, or for testing whether two identifiers are potentially confusable, may be called
210 * concurrently from multiple threads using the same <code>SpoofChecker</code> instance.
211 *
212 * @hide Only a subset of ICU is exposed in Android
213 */
214public class SpoofChecker {
215
216    /**
217     * Constants from UTS 39 for use in setRestrictionLevel.
218     */
219    public enum RestrictionLevel {
220        /**
221         * All characters in the string are in the identifier profile and all characters in the string are in the ASCII
222         * range.
223         */
224        ASCII,
225        /**
226         * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and the
227         * string is single-script, according to the definition in UTS 39 section 5.1.
228         */
229        SINGLE_SCRIPT_RESTRICTIVE,
230        /**
231         * The string classifies as Single Script, or all characters in the string are in the identifier profile and the
232         * string is covered by any of the following sets of scripts, according to the definition in UTS 39 section 5.1:
233         * <ul>
234         * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
235         * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
236         * <li>Latin + Han + Hangul (or equivalently: Latn +Kore)</li>
237         * </ul>
238         */
239        HIGHLY_RESTRICTIVE,
240        /**
241         * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
242         * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
243         * Greek, and Cherokee.
244         */
245        MODERATELY_RESTRICTIVE,
246        /**
247         * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts, such as
248         * Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us.
249         */
250        MINIMALLY_RESTRICTIVE,
251        /**
252         * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
253         */
254        UNRESTRICTIVE,
255    }
256
257    /**
258     * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
259     *
260     * @hide draft / provisional / internal are hidden on Android
261     */
262    public static final UnicodeSet INCLUSION = new UnicodeSet(
263            "['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u200C\\u200D\\u2010\\u"
264                    + "2019\\u2027\\u30A0\\u30FB]").freeze();
265    // Note: data from http://unicode.org/Public/security/9.0.0/IdentifierStatus.txt
266    // There is tooling to generate this constant in the unicodetools project:
267    //      org.unicode.text.tools.RecommendedSetGenerator
268    // It will print the Java and C++ code to the console for easy copy-paste into this file.
269
270    /**
271     * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
272     *
273     * @hide draft / provisional / internal are hidden on Android
274     */
275    public static final UnicodeSet RECOMMENDED = new UnicodeSet(
276            "[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u014"
277                    + "8\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E"
278                    + "6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02B"
279                    + "C\\u02EC\\u0300-\\u0304\\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u03"
280                    + "28\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386"
281                    + "\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-\\u0529\\u05"
282                    + "2E\\u052F\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0"
283                    + "620-\\u063F\\u0641-\\u0655\\u0660-\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-"
284                    + "\\u06D3\\u06D5\\u06E5\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2"
285                    + "\\u08B6-\\u08BD\\u0901-\\u094D\\u094F\\u0950\\u0956\\u0957\\u0960-\\u0963\\u0966-\\u096"
286                    + "F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u0"
287                    + "9A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CE\\u"
288                    + "09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-"
289                    + "\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\"
290                    + "u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A9"
291                    + "3-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0"
292                    + "ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F\\"
293                    + "u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47"
294                    + "\\u0B48\\u0B4B-\\u0B4D\\u0B56\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82\\u0B83"
295                    + "\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3"
296                    + "\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0B"
297                    + "D0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u"
298                    + "0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56"
299                    + "\\u0C60\\u0C61\\u0C66-\\u0C6F\\u0C80\\u0C82\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92"
300                    + "-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0"
301                    + "CD5\\u0CD6\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1\\u0CF2\\u0D02\\u0D03\\u0D05-\\u0D0C\\u0"
302                    + "D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D54-\\u0D57"
303                    + "\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D9"
304                    + "6\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0"
305                    + "DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\"
306                    + "u0E59\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u"
307                    + "0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD\\"
308                    + "u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE\\u0EDF\\u0F00\\u0F20-\\u0F29"
309                    + "\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F"
310                    + "56\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0"
311                    + "F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6"
312                    + "\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10"
313                    + "C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D\\u"
314                    + "1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2"
315                    + "-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1"
316                    + "315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-"
317                    + "\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1C80-\\u1C88\\u1E00-\\u1E9"
318                    + "9\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1"
319                    + "F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F"
320                    + "7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1"
321                    + "FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-"
322                    + "\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0"
323                    + "-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u3"
324                    + "005-\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E\\u30A1-\\u30FA\\u30FC-\\u30FE\\u"
325                    + "3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FD5\\uA660\\uA661\\uA674-\\uA67B"
326                    + "\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D\\uA78E\\uA790-\\uA793\\uA7A0-\\uA7AA\\uA7AE"
327                    + "\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB"
328                    + "11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E\\uFA0F\\uFA11\\uFA13\\uF"
329                    + "A14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\U00020000-\\U0002A6D6\\U0002A700-\\U0"
330                    + "002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1]").freeze();
331    // Note: data from http://unicode.org/Public/security/9.0.0/IdentifierStatus.txt
332    // There is tooling to generate this constant in the unicodetools project:
333    //      org.unicode.text.tools.RecommendedSetGenerator
334    // It will print the Java and C++ code to the console for easy copy-paste into this file.
335
336    /**
337     * Constants for the kinds of checks that USpoofChecker can perform. These values are used both to select the set of
338     * checks that will be performed, and to report results from the check function.
339     *
340     */
341
342    /**
343     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
344     * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
345     * 4.
346     */
347    public static final int SINGLE_SCRIPT_CONFUSABLE = 1;
348
349    /**
350     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
351     * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
352     * 39 section 4.
353     */
354    public static final int MIXED_SCRIPT_CONFUSABLE = 2;
355
356    /**
357     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
358     * that the two strings are visually confusable and that they are not from the same script but both of them are
359     * single-script strings, according to UTS 39 section 4.
360     */
361    public static final int WHOLE_SCRIPT_CONFUSABLE = 4;
362
363    /**
364     * Enable this flag in {@link SpoofChecker.Builder#setChecks} to turn on all types of confusables. You may set the
365     * checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to make
366     * {@link SpoofChecker#areConfusable} return only those types of confusables.
367     *
368     * @hide draft / provisional / internal are hidden on Android
369     */
370    public static final int CONFUSABLE = SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE;
371
372    /**
373     * This flag is deprecated and no longer affects the behavior of SpoofChecker.
374     *
375     * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was
376     * deprecated.
377     */
378    @Deprecated
379    public static final int ANY_CASE = 8;
380
381    /**
382     * Check that an identifier satisfies the requirements for the restriction level specified in
383     * {@link SpoofChecker.Builder#setRestrictionLevel}. The default restriction level is
384     * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}.
385     *
386     * @hide draft / provisional / internal are hidden on Android
387     */
388    public static final int RESTRICTION_LEVEL = 16;
389
390    /**
391     * Check that an identifier contains only characters from a single script (plus chars from the common and inherited
392     * scripts.) Applies to checks of a single identifier check only.
393     *
394     * @deprecated ICU 51 Use RESTRICTION_LEVEL
395     */
396    @Deprecated
397    public static final int SINGLE_SCRIPT = RESTRICTION_LEVEL;
398
399    /**
400     * Check an identifier for the presence of invisible characters, such as zero-width spaces, or character sequences
401     * that are likely not to display, such as multiple occurrences of the same non-spacing mark. This check does not
402     * test the input string as a whole for conformance to any particular syntax for identifiers.
403     */
404    public static final int INVISIBLE = 32;
405
406    /**
407     * Check that an identifier contains only characters from a specified set of acceptable characters. See
408     * {@link Builder#setAllowedChars} and {@link Builder#setAllowedLocales}. Note that a string that fails this check
409     * will also fail the {@link #RESTRICTION_LEVEL} check.
410     */
411    public static final int CHAR_LIMIT = 64;
412
413    /**
414     * Check that an identifier does not mix numbers from different numbering systems. For more information, see UTS 39
415     * section 5.3.
416     *
417     * @hide draft / provisional / internal are hidden on Android
418     */
419    public static final int MIXED_NUMBERS = 128;
420
421    // Update CheckResult.toString() when a new check is added.
422
423    /**
424     * Enable all spoof checks.
425     */
426    public static final int ALL_CHECKS = 0xFFFFFFFF;
427
428    // Used for checking for ASCII-Only restriction level
429    static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();
430
431    /**
432     * private constructor: a SpoofChecker has to be built by the builder
433     */
434    private SpoofChecker() {
435    }
436
437    /**
438     * SpoofChecker Builder. To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired
439     * checking options on the builder, then call the build() function to create a SpoofChecker instance.
440     */
441    public static class Builder {
442        int fChecks; // Bit vector of checks to perform.
443        SpoofData fSpoofData;
444        final UnicodeSet fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); // The UnicodeSet of allowed characters.
445        // for this Spoof Checker. Defaults to all chars.
446        final Set<ULocale> fAllowedLocales = new LinkedHashSet<ULocale>(); // The list of allowed locales.
447        private RestrictionLevel fRestrictionLevel;
448
449        /**
450         * Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for
451         * LOCALE_LIMIT and CHAR_LIMIT. Note that additional checks may be added in the future, resulting in the changes
452         * to the default checking behavior.
453         */
454        public Builder() {
455            fChecks = ALL_CHECKS;
456            fSpoofData = null;
457            fRestrictionLevel = RestrictionLevel.HIGHLY_RESTRICTIVE;
458        }
459
460        /**
461         * Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker.
462         *
463         * @param src
464         *            The existing checker.
465         */
466        public Builder(SpoofChecker src) {
467            fChecks = src.fChecks;
468            fSpoofData = src.fSpoofData; // For the data, we will either use the source data
469                                         // as-is, or drop the builder's reference to it
470                                         // and generate new data, depending on what our
471                                         // caller does with the builder.
472            fAllowedCharsSet.set(src.fAllowedCharsSet);
473            fAllowedLocales.addAll(src.fAllowedLocales);
474            fRestrictionLevel = src.fRestrictionLevel;
475        }
476
477        /**
478         * Create a SpoofChecker with current configuration.
479         *
480         * @return SpoofChecker
481         */
482        public SpoofChecker build() {
483            // TODO: Make this data loading be lazy (see #12696).
484            if (fSpoofData == null) {
485                // read binary file
486                fSpoofData = SpoofData.getDefault();
487            }
488
489            // Copy all state from the builder to the new SpoofChecker.
490            // Make sure that everything is either cloned or copied, so
491            // that subsequent re-use of the builder won't modify the built
492            // SpoofChecker.
493            //
494            // One exception to this: the SpoofData is just assigned.
495            // If the builder subsequently needs to modify fSpoofData
496            // it will create a new SpoofData object first.
497
498            SpoofChecker result = new SpoofChecker();
499            result.fChecks = this.fChecks;
500            result.fSpoofData = this.fSpoofData;
501            result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone());
502            result.fAllowedCharsSet.freeze();
503            result.fAllowedLocales = new HashSet<ULocale>(this.fAllowedLocales);
504            result.fRestrictionLevel = this.fRestrictionLevel;
505            return result;
506        }
507
508        /**
509         * Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data file
510         * confusables.txt as described in Unicode UAX 39. The syntax of the source data is as described in UAX 39 for
511         * these files, and the content of these files is acceptable input.
512         *
513         * @param confusables
514         *            the Reader of confusable characters definitions, as found in file confusables.txt from
515         *            unicode.org.
516         * @throws ParseException
517         *             To report syntax errors in the input.
518         *
519         * @hide draft / provisional / internal are hidden on Android
520         */
521        public Builder setData(Reader confusables) throws ParseException, IOException {
522
523            // Compile the binary data from the source (text) format.
524            // Drop the builder's reference to any pre-existing data, which may
525            // be in use in an already-built checker.
526
527            fSpoofData = new SpoofData();
528            ConfusabledataBuilder.buildConfusableData(confusables, fSpoofData);
529            return this;
530        }
531
532        /**
533         * Deprecated as of ICU 58; use {@link SpoofChecker.Builder#setData(Reader confusables)} instead.
534         *
535         * @param confusables
536         *            the Reader of confusable characters definitions, as found in file confusables.txt from
537         *            unicode.org.
538         * @param confusablesWholeScript
539         *            No longer supported.
540         * @throws ParseException
541         *             To report syntax errors in the input.
542         *
543         * @deprecated ICU 58
544         */
545        @Deprecated
546        public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, IOException {
547            setData(confusables);
548            return this;
549        }
550
551        /**
552         * Specify the bitmask of checks that will be performed by {@link SpoofChecker#failsChecks}. Calling this method
553         * overwrites any checks that may have already been enabled. By default, all checks are enabled.
554         *
555         * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together. For
556         * example, to fail strings containing characters outside of the set specified by {@link #setAllowedChars} and
557         * also strings that contain digits from mixed numbering systems:
558         *
559         * <pre>
560         * {@code
561         * builder.setChecks(SpoofChecker.CHAR_LIMIT | SpoofChecker.MIXED_NUMBERS);
562         * }
563         * </pre>
564         *
565         * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from
566         * ALL_CHECKS. For example, if you are not planning to use the {@link SpoofChecker#areConfusable} functionality,
567         * it is good practice to disable the CONFUSABLE check:
568         *
569         * <pre>
570         * {@code
571         * builder.setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CONFUSABLE);
572         * }
573         * </pre>
574         *
575         * Note that methods such as {@link #setAllowedChars}, {@link #setAllowedLocales}, and
576         * {@link #setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
577         * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
578         * methods.
579         *
580         * @param checks
581         *            The set of checks that this spoof checker will perform. The value is an 'or' of the desired
582         *            checks.
583         * @return self
584         */
585        public Builder setChecks(int checks) {
586            // Verify that the requested checks are all ones (bits) that
587            // are acceptable, known values.
588            if (0 != (checks & ~SpoofChecker.ALL_CHECKS)) {
589                throw new IllegalArgumentException("Bad Spoof Checks value.");
590            }
591            this.fChecks = (checks & SpoofChecker.ALL_CHECKS);
592            return this;
593        }
594
595        /**
596         * Limit characters that are acceptable in identifiers being checked to those normally used with the languages
597         * associated with the specified locales. Any previously specified list of locales is replaced by the new
598         * settings.
599         *
600         * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is
601         * determined. Characters from this set of scripts, along with characters from the "common" and "inherited"
602         * Unicode Script categories will be permitted.
603         *
604         * Supplying an empty string removes all restrictions; characters from any script will be allowed.
605         *
606         * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a
607         * non-empty list of locales.
608         *
609         * The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function.
610         * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters.
611         *
612         * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of
613         * {@link #setAllowedChars} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with
614         * the Unicode Set API, then resetting the spoof detectors limits with {@link #setAllowedChars}.
615         *
616         * @param locales
617         *            A Set of ULocales, from which the language and associated script are extracted. If the locales Set
618         *            is null, no restrictions will be placed on the allowed characters.
619         *
620         * @return self
621         */
622        public Builder setAllowedLocales(Set<ULocale> locales) {
623            fAllowedCharsSet.clear();
624
625            for (ULocale locale : locales) {
626                // Add the script chars for this locale to the accumulating set
627                // of allowed chars.
628                addScriptChars(locale, fAllowedCharsSet);
629            }
630
631            // If our caller provided an empty list of locales, we disable the
632            // allowed characters checking
633            fAllowedLocales.clear();
634            if (locales.size() == 0) {
635                fAllowedCharsSet.add(0, 0x10ffff);
636                fChecks &= ~CHAR_LIMIT;
637                return this;
638            }
639
640            // Add all common and inherited characters to the set of allowed
641            // chars.
642            UnicodeSet tempSet = new UnicodeSet();
643            tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON);
644            fAllowedCharsSet.addAll(tempSet);
645            tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED);
646            fAllowedCharsSet.addAll(tempSet);
647
648            // Store the updated spoof checker state.
649            fAllowedLocales.clear();
650            fAllowedLocales.addAll(locales);
651            fChecks |= CHAR_LIMIT;
652            return this;
653        }
654
655        /**
656         * Limit characters that are acceptable in identifiers being checked to those normally used with the languages
657         * associated with the specified locales. Any previously specified list of locales is replaced by the new
658         * settings.
659         *
660         * @param locales
661         *            A Set of Locales, from which the language and associated script are extracted. If the locales Set
662         *            is null, no restrictions will be placed on the allowed characters.
663         *
664         * @return self
665         */
666        public Builder setAllowedJavaLocales(Set<Locale> locales) {
667            HashSet<ULocale> ulocales = new HashSet<ULocale>(locales.size());
668            for (Locale locale : locales) {
669                ulocales.add(ULocale.forLocale(locale));
670            }
671            return setAllowedLocales(ulocales);
672        }
673
674        // Add (union) to the UnicodeSet all of the characters for the scripts
675        // used for the specified locale. Part of the implementation of
676        // setAllowedLocales.
677        private void addScriptChars(ULocale locale, UnicodeSet allowedChars) {
678            int scripts[] = UScript.getCode(locale);
679            if (scripts != null) {
680                UnicodeSet tmpSet = new UnicodeSet();
681                for (int i = 0; i < scripts.length; i++) {
682                    tmpSet.applyIntPropertyValue(UProperty.SCRIPT, scripts[i]);
683                    allowedChars.addAll(tmpSet);
684                }
685            }
686            // else it's an unknown script.
687            // Maybe they asked for the script of "zxx", which refers to no linguistic content.
688            // Maybe they asked for the script of a newer locale that we don't know in the older version of ICU.
689        }
690
691        /**
692         * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit
693         * is is replaced by the new settings. This includes limits on characters that were set with the
694         * setAllowedLocales() function. Note that the RESTRICTED set is useful.
695         *
696         * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker by this function.
697         *
698         * @param chars
699         *            A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by
700         *            this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling
701         *            this function. Note that this clears the allowedLocales set.
702         * @return self
703         */
704        public Builder setAllowedChars(UnicodeSet chars) {
705            fAllowedCharsSet.set(chars);
706            fAllowedLocales.clear();
707            fChecks |= CHAR_LIMIT;
708            return this;
709        }
710
711        /**
712         * Set the loosest restriction level allowed for strings. The default if this is not called is
713         * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Calling this method enables the {@link #RESTRICTION_LEVEL} and
714         * {@link #MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
715         * to be performed by {@link SpoofChecker#failsChecks}, see {@link #setChecks}.
716         *
717         * @param restrictionLevel
718         *            The loosest restriction level allowed.
719         * @return self
720         * @hide draft / provisional / internal are hidden on Android
721         */
722        public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) {
723            fRestrictionLevel = restrictionLevel;
724            fChecks |= RESTRICTION_LEVEL | MIXED_NUMBERS;
725            return this;
726        }
727
728        /*
729         * *****************************************************************************
         * Internal classes for compiling confusable data into its binary (runtime) form.
731         * *****************************************************************************
732         */
733        // ---------------------------------------------------------------------
734        //
735        // buildConfusableData Compile the source confusable data, as defined by
736        // the Unicode data file confusables.txt, into the binary
737        // structures used by the confusable detector.
738        //
739        // The binary structures are described in uspoof_impl.h
740        //
741        // 1. parse the data, making a hash table mapping from a codepoint to a String.
742        //
743        // 2. Sort all of the strings encountered by length, since they will need to
744        // be stored in that order in the final string table.
745        // TODO: Sorting these strings by length is no longer needed since the removal of
746        // the string lengths table.  This logic can be removed to save processing time
747        // when building confusables data.
748        //
749        // 3. Build a list of keys (UChar32s) from the mapping table. Sort the
750        // list because that will be the ordering of our runtime table.
751        //
752        // 4. Generate the run time string table. This is generated before the key & value
753        // table because we need the string indexes when building those tables.
754        //
755        // 5. Build the run-time key and value table. These are parallel tables, and
756        // are built at the same time
757
758        // class ConfusabledataBuilder
759        // An instance of this class exists while the confusable data is being built from source.
760        // It encapsulates the intermediate data structures that are used for building.
761        // It exports one static function, to do a confusable data build.
762        private static class ConfusabledataBuilder {
763
764            private Hashtable<Integer, SPUString> fTable;
765            private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the
766                                        // four mapping tables.
767
768            // The compiled data is first assembled into the following four collections,
769            // then output to the builder's SpoofData object.
770            private StringBuffer fStringTable;
771            private ArrayList<Integer> fKeyVec;
772            private ArrayList<Integer> fValueVec;
773            private SPUStringPool stringPool;
774            private Pattern fParseLine;
775            private Pattern fParseHexNum;
776            private int fLineNum;
777
778            ConfusabledataBuilder() {
779                fTable = new Hashtable<Integer, SPUString>();
780                fKeySet = new UnicodeSet();
781                fKeyVec = new ArrayList<Integer>();
782                fValueVec = new ArrayList<Integer>();
783                stringPool = new SPUStringPool();
784            }
785
786            void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException {
787                StringBuffer fInput = new StringBuffer();
788
789                // Convert the user input data from UTF-8 to char (UTF-16)
790                LineNumberReader lnr = new LineNumberReader(confusables);
791                do {
792                    String line = lnr.readLine();
793                    if (line == null) {
794                        break;
795                    }
796                    fInput.append(line);
797                    fInput.append('\n');
798                } while (true);
799
800                // Regular Expression to parse a line from Confusables.txt. The expression will match
801                // any line. What was matched is determined by examining which capture groups have a match.
802                // Capture Group 1: the source char
803                // Capture Group 2: the replacement chars
804                // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
805                // Capture Group 7: A blank or comment only line.
806                // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
807                // Example Line from the confusables.txt source file:
808                // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
809                fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char
810                        "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s)
811                        "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued)
812                        "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type
813                        "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment
814                        "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment
815                        "|^(.*?)$"); // OR match any line, which catches illegal lines.
816
817                // Regular expression for parsing a hex number out of a space-separated list of them.
818                // Capture group 1 gets the number, with spaces removed.
819                fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)");
820
821                // Zap any Byte Order Mark at the start of input. Changing it to a space
822                // is benign given the syntax of the input.
823                if (fInput.charAt(0) == 0xfeff) {
824                    fInput.setCharAt(0, (char) 0x20);
825                }
826
827                // Parse the input, one line per iteration of this loop.
828                Matcher matcher = fParseLine.matcher(fInput);
829                while (matcher.find()) {
830                    fLineNum++;
831                    if (matcher.start(7) >= 0) {
832                        // this was a blank or comment line.
833                        continue;
834                    }
835                    if (matcher.start(8) >= 0) {
836                        // input file syntax error.
837                        // status = U_PARSE_ERROR;
838                        throw new ParseException(
839                                "Confusables, line " + fLineNum + ": Unrecognized Line: " + matcher.group(8),
840                                matcher.start(8));
841                    }
842
843                    // We have a good input line. Extract the key character and mapping
844                    // string, and
845                    // put them into the appropriate mapping table.
846                    int keyChar = Integer.parseInt(matcher.group(1), 16);
847                    if (keyChar > 0x10ffff) {
848                        throw new ParseException(
849                                "Confusables, line " + fLineNum + ": Bad code point: " + matcher.group(1),
850                                matcher.start(1));
851                    }
852                    Matcher m = fParseHexNum.matcher(matcher.group(2));
853
854                    StringBuilder mapString = new StringBuilder();
855                    while (m.find()) {
856                        int c = Integer.parseInt(m.group(1), 16);
857                        if (c > 0x10ffff) {
858                            throw new ParseException(
859                                    "Confusables, line " + fLineNum + ": Bad code point: " + Integer.toString(c, 16),
860                                    matcher.start(2));
861                        }
862                        mapString.appendCodePoint(c);
863                    }
864                    assert (mapString.length() >= 1);
865
866                    // Put the map (value) string into the string pool
867                    // This a little like a Java intern() - any duplicates will be
868                    // eliminated.
869                    SPUString smapString = stringPool.addString(mapString.toString());
870
871                    // Add the char . string mapping to the table.
872                    // For Unicode 8, the SL, SA and ML tables have been discontinued.
873                    // All input data from confusables.txt is tagged MA.
874                    fTable.put(keyChar, smapString);
875
876                    fKeySet.add(keyChar);
877                }
878
879                // Input data is now all parsed and collected.
880                // Now create the run-time binary form of the data.
881                //
882                // This is done in two steps. First the data is assembled into vectors and strings,
883                // for ease of construction, then the contents of these collections are copied
884                // into the actual SpoofData object.
885
886                // Build up the string array, and record the index of each string therein
887                // in the (build time only) string pool.
888                // Strings of length one are not entered into the strings array.
889                // (Strings in the table are sorted by length)
890
891                stringPool.sort();
892                fStringTable = new StringBuffer();
893                int poolSize = stringPool.size();
894                int i;
895                for (i = 0; i < poolSize; i++) {
896                    SPUString s = stringPool.getByIndex(i);
897                    int strLen = s.fStr.length();
898                    int strIndex = fStringTable.length();
899                    if (strLen == 1) {
900                        // strings of length one do not get an entry in the string table.
901                        // Keep the single string character itself here, which is the same
902                        // convention that is used in the final run-time string table index.
903                        s.fCharOrStrTableIndex = s.fStr.charAt(0);
904                    } else {
905                        s.fCharOrStrTableIndex = strIndex;
906                        fStringTable.append(s.fStr);
907                    }
908                }
909
910                // Construct the compile-time Key and Value table.
911                //
912                // The keys in the Key table follow the format described in uspoof.h for the
913                // Cfu confusables data structure.
914                //
915                // Starting in ICU 58, each code point has exactly one entry in the data
916                // structure.
917
918                for (String keyCharStr : fKeySet) {
919                    int keyChar = keyCharStr.codePointAt(0);
920                    SPUString targetMapping = fTable.get(keyChar);
921                    assert targetMapping != null;
922
923                    // Throw a sane exception if trying to consume a long string.  Otherwise,
924                    // codePointAndLengthToKey will throw an assertion error.
925                    if (targetMapping.fStr.length() > 256) {
926                        throw new IllegalArgumentException("Confusable prototypes cannot be longer than 256 entries.");
927                    }
928
929                    int key = ConfusableDataUtils.codePointAndLengthToKey(keyChar, targetMapping.fStr.length());
930                    int value = targetMapping.fCharOrStrTableIndex;
931
932                    fKeyVec.add(key);
933                    fValueVec.add(value);
934                }
935
936                // Put the assembled data into the destination SpoofData object.
937
938                // The Key Table
939                // While copying the keys to the output array,
940                // also sanity check that the keys are sorted.
941                int numKeys = fKeyVec.size();
942                dest.fCFUKeys = new int[numKeys];
943                int previousCodePoint = 0;
944                for (i = 0; i < numKeys; i++) {
945                    int key = fKeyVec.get(i);
946                    int codePoint = ConfusableDataUtils.keyToCodePoint(key);
947                    // strictly greater because there can be only one entry per code point
948                    assert codePoint > previousCodePoint;
949                    dest.fCFUKeys[i] = key;
950                    previousCodePoint = codePoint;
951                }
952
953                // The Value Table, parallels the key table
954                int numValues = fValueVec.size();
955                assert (numKeys == numValues);
956                dest.fCFUValues = new short[numValues];
957                i = 0;
958                for (int value : fValueVec) {
959                    assert (value < 0xffff);
960                    dest.fCFUValues[i++] = (short) value;
961                }
962
963                // The Strings Table.
964                dest.fCFUStrings = fStringTable.toString();
965            }
966
967            public static void buildConfusableData(Reader confusables, SpoofData dest)
968                    throws java.io.IOException, ParseException {
969                ConfusabledataBuilder builder = new ConfusabledataBuilder();
970                builder.build(confusables, dest);
971            }
972
973            /*
974             * *****************************************************************************
975             * Internal classes for compiling confusable data into its binary (runtime) form.
976             * *****************************************************************************
977             */
978            // SPUString
979            // Holds a string that is the result of one of the mappings defined
980            // by the confusable mapping data (confusables.txt from Unicode.org)
981            // Instances of SPUString exist during the compilation process only.
982
983            private static class SPUString {
984                String fStr; // The actual string.
985                int fCharOrStrTableIndex; // Index into the final runtime data for this string.
986                // (or, for length 1, the single string char itself,
987                // there being no string table entry for it.)
988
989                SPUString(String s) {
990                    fStr = s;
991                    fCharOrStrTableIndex = 0;
992                }
993            }
994
995            // Comparison function for ordering strings in the string pool.
996            // Compare by length first, then, within a group of the same length,
997            // by code point order.
998
999            private static class SPUStringComparator implements Comparator<SPUString> {
1000                @Override
1001                public int compare(SPUString sL, SPUString sR) {
1002                    int lenL = sL.fStr.length();
1003                    int lenR = sR.fStr.length();
1004                    if (lenL < lenR) {
1005                        return -1;
1006                    } else if (lenL > lenR) {
1007                        return 1;
1008                    } else {
1009                        return sL.fStr.compareTo(sR.fStr);
1010                    }
1011                }
1012
1013                final static SPUStringComparator INSTANCE = new SPUStringComparator();
1014            }
1015
1016            // String Pool A utility class for holding the strings that are the result of
1017            // the spoof mappings. These strings will utimately end up in the
1018            // run-time String Table.
1019            // This is sort of like a sorted set of strings, except that ICU's anemic
1020            // built-in collections don't support those, so it is implemented with a
1021            // combination of a uhash and a Vector.
1022            private static class SPUStringPool {
1023                public SPUStringPool() {
1024                    fVec = new Vector<SPUString>();
1025                    fHash = new Hashtable<String, SPUString>();
1026                }
1027
1028                public int size() {
1029                    return fVec.size();
1030                }
1031
1032                // Get the n-th string in the collection.
1033                public SPUString getByIndex(int index) {
1034                    SPUString retString = fVec.elementAt(index);
1035                    return retString;
1036                }
1037
1038                // Add a string. Return the string from the table.
1039                // If the input parameter string is already in the table, delete the
1040                // input parameter and return the existing string.
1041                public SPUString addString(String src) {
1042                    SPUString hashedString = fHash.get(src);
1043                    if (hashedString == null) {
1044                        hashedString = new SPUString(src);
1045                        fHash.put(src, hashedString);
1046                        fVec.addElement(hashedString);
1047                    }
1048                    return hashedString;
1049                }
1050
1051                // Sort the contents; affects the ordering of getByIndex().
1052                public void sort() {
1053                    Collections.sort(fVec, SPUStringComparator.INSTANCE);
1054                }
1055
1056                private Vector<SPUString> fVec; // Elements are SPUString *
1057                private Hashtable<String, SPUString> fHash; // Key: Value:
1058            }
1059
1060        }
1061    }
1062
1063    /**
1064     * Get the Restriction Level that is being tested.
1065     *
1066     * @return The restriction level
1067     * @deprecated This API is ICU internal only.
1068     * @hide draft / provisional / internal are hidden on Android
1069     */
1070    @Deprecated
1071    public RestrictionLevel getRestrictionLevel() {
1072        return fRestrictionLevel;
1073    }
1074
1075    /**
1076     * Get the set of checks that this Spoof Checker has been configured to perform.
1077     *
1078     * @return The set of checks that this spoof checker will perform.
1079     */
1080    public int getChecks() {
1081        return fChecks;
1082    }
1083
1084    /**
1085     * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on
1086     * scripts have been specified, an empty set will be returned.
1087     *
1088     * setAllowedChars() will reset the list of allowed locales to be empty.
1089     *
1090     * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales();
1091     * the information other than languages from the originally specified locales may be omitted.
1092     *
1093     * @return A set of locales corresponding to the acceptable scripts.
1094     */
1095    public Set<ULocale> getAllowedLocales() {
1096        return Collections.unmodifiableSet(fAllowedLocales);
1097    }
1098
1099    /**
1100     * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If
1101     * no limitations on scripts have been specified, an empty set will be returned.
1102     *
1103     * @return A set of locales corresponding to the acceptable scripts.
1104     */
1105    public Set<Locale> getAllowedJavaLocales() {
1106        HashSet<Locale> locales = new HashSet<Locale>(fAllowedLocales.size());
1107        for (ULocale uloc : fAllowedLocales) {
1108            locales.add(uloc.toLocale());
1109        }
1110        return locales;
1111    }
1112
1113    /**
1114     * Get a UnicodeSet for the characters permitted in an identifier. This corresponds to the limits imposed by the Set
1115     * Allowed Characters functions. Limitations imposed by other checks will not be reflected in the set returned by
1116     * this function.
1117     *
1118     * The returned set will be frozen, meaning that it cannot be modified by the caller.
1119     *
1120     * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test.
1121     */
1122    public UnicodeSet getAllowedChars() {
1123        return fAllowedCharsSet;
1124    }
1125
1126    /**
1127     * A struct-like class to hold the results of a Spoof Check operation. Tells which check(s) have failed.
1128     */
1129    public static class CheckResult {
1130        /**
1131         * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
1132         * in question: RESTRICTION_LEVEL, CHAR_LIMIT, and so on.
1133         *
1134         * @see Builder#setChecks
1135         */
1136        public int checks;
1137
1138        /**
1139         * The index of the first string position that failed a check.
1140         *
1141         * @deprecated ICU 51. No longer supported. Always set to zero.
1142         */
1143        @Deprecated
1144        public int position;
1145
1146        /**
1147         * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null.  The set will contain the zero
1148         * digit from each decimal number system found in the input string.
1149         *
1150         * @hide draft / provisional / internal are hidden on Android
1151         */
1152        public UnicodeSet numerics;
1153
1154        /**
1155         * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null.
1156         *
1157         * @hide draft / provisional / internal are hidden on Android
1158         */
1159        public RestrictionLevel restrictionLevel;
1160
1161        /**
1162         * Default constructor
1163         */
1164        public CheckResult() {
1165            checks = 0;
1166            position = 0;
1167        }
1168
1169        /**
1170         * {@inheritDoc}
1171         */
1172        @Override
1173        public String toString() {
1174            StringBuilder sb = new StringBuilder();
1175            sb.append("checks:");
1176            if (checks == 0) {
1177                sb.append(" none");
1178            } else if (checks == ALL_CHECKS) {
1179                sb.append(" all");
1180            } else {
1181                if ((checks & SINGLE_SCRIPT_CONFUSABLE) != 0) {
1182                    sb.append(" SINGLE_SCRIPT_CONFUSABLE");
1183                }
1184                if ((checks & MIXED_SCRIPT_CONFUSABLE) != 0) {
1185                    sb.append(" MIXED_SCRIPT_CONFUSABLE");
1186                }
1187                if ((checks & WHOLE_SCRIPT_CONFUSABLE) != 0) {
1188                    sb.append(" WHOLE_SCRIPT_CONFUSABLE");
1189                }
1190                if ((checks & ANY_CASE) != 0) {
1191                    sb.append(" ANY_CASE");
1192                }
1193                if ((checks & RESTRICTION_LEVEL) != 0) {
1194                    sb.append(" RESTRICTION_LEVEL");
1195                }
1196                if ((checks & INVISIBLE) != 0) {
1197                    sb.append(" INVISIBLE");
1198                }
1199                if ((checks & CHAR_LIMIT) != 0) {
1200                    sb.append(" CHAR_LIMIT");
1201                }
1202                if ((checks & MIXED_NUMBERS) != 0) {
1203                    sb.append(" MIXED_NUMBERS");
1204                }
1205            }
1206            sb.append(", numerics: ").append(numerics.toPattern(false));
1207            sb.append(", position: ").append(position);
1208            sb.append(", restrictionLevel: ").append(restrictionLevel);
1209            return sb.toString();
1210        }
1211    }
1212
1213    /**
1214     * Check the specified string for possible security issues. The text to be checked will typically be an identifier
1215     * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
1216     *
1217     * @param text
1218     *            A String to be checked for possible security issues.
1219     * @param checkResult
1220     *            Output parameter, indicates which specific tests failed. May be null if the information is not wanted.
1221     * @return True there any issue is found with the input string.
1222     */
1223    public boolean failsChecks(String text, CheckResult checkResult) {
1224        int length = text.length();
1225
1226        int result = 0;
1227        if (checkResult != null) {
1228            checkResult.position = 0;
1229            checkResult.numerics = null;
1230            checkResult.restrictionLevel = null;
1231        }
1232
1233        if (0 != (this.fChecks & RESTRICTION_LEVEL)) {
1234            RestrictionLevel textRestrictionLevel = getRestrictionLevel(text);
1235            if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) {
1236                result |= RESTRICTION_LEVEL;
1237            }
1238            if (checkResult != null) {
1239                checkResult.restrictionLevel = textRestrictionLevel;
1240            }
1241        }
1242
1243        if (0 != (this.fChecks & MIXED_NUMBERS)) {
1244            UnicodeSet numerics = new UnicodeSet();
1245            getNumerics(text, numerics);
1246            if (numerics.size() > 1) {
1247                result |= MIXED_NUMBERS;
1248            }
1249            if (checkResult != null) {
1250                checkResult.numerics = numerics;
1251            }
1252        }
1253
1254        if (0 != (this.fChecks & CHAR_LIMIT)) {
1255            int i;
1256            int c;
1257            for (i = 0; i < length;) {
1258                // U16_NEXT(text, i, length, c);
1259                c = Character.codePointAt(text, i);
1260                i = Character.offsetByCodePoints(text, i, 1);
1261                if (!this.fAllowedCharsSet.contains(c)) {
1262                    result |= CHAR_LIMIT;
1263                    break;
1264                }
1265            }
1266        }
1267
1268        if (0 != (this.fChecks & INVISIBLE)) {
1269            // This check needs to be done on NFD input
1270            String nfdText = nfdNormalizer.normalize(text);
1271
1272            // scan for more than one occurrence of the same non-spacing mark
1273            // in a sequence of non-spacing marks.
1274            int i;
1275            int c;
1276            int firstNonspacingMark = 0;
1277            boolean haveMultipleMarks = false;
1278            UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a
1279                                                          // single combining sequence.
1280            for (i = 0; i < length;) {
1281                c = Character.codePointAt(nfdText, i);
1282                i = Character.offsetByCodePoints(nfdText, i, 1);
1283                if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) {
1284                    firstNonspacingMark = 0;
1285                    if (haveMultipleMarks) {
1286                        marksSeenSoFar.clear();
1287                        haveMultipleMarks = false;
1288                    }
1289                    continue;
1290                }
1291                if (firstNonspacingMark == 0) {
1292                    firstNonspacingMark = c;
1293                    continue;
1294                }
1295                if (!haveMultipleMarks) {
1296                    marksSeenSoFar.add(firstNonspacingMark);
1297                    haveMultipleMarks = true;
1298                }
1299                if (marksSeenSoFar.contains(c)) {
1300                    // report the error, and stop scanning.
1301                    // No need to find more than the first failure.
1302                    result |= INVISIBLE;
1303                    break;
1304                }
1305                marksSeenSoFar.add(c);
1306            }
1307        }
1308        if (checkResult != null) {
1309            checkResult.checks = result;
1310        }
1311        return (0 != result);
1312    }
1313
1314    /**
1315     * Check the specified string for possible security issues. The text to be checked will typically be an identifier
1316     * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
1317     *
1318     * @param text
1319     *            A String to be checked for possible security issues.
1320     * @return True there any issue is found with the input string.
1321     */
1322    public boolean failsChecks(String text) {
1323        return failsChecks(text, null);
1324    }
1325
1326    /**
1327     * Check the whether two specified strings are visually confusable. The types of confusability to be tested - single
1328     * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker.
1329     *
1330     * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE
1331     * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected.
1332     *
1333     * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case
1334     * folded for comparison and display to the user, do not select the ANY_CASE option.
1335     *
1336     *
1337     * @param s1
1338     *            The first of the two strings to be compared for confusability.
1339     * @param s2
1340     *            The second of the two strings to be compared for confusability.
1341     * @return Non-zero if s1 and s1 are confusable. If not 0, the value will indicate the type(s) of confusability
1342     *         found, as defined by spoof check test constants.
1343     */
1344    public int areConfusable(String s1, String s2) {
1345        //
1346        // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
1347        // and for definitions of the types (single, whole, mixed-script) of confusables.
1348
1349        // We only care about a few of the check flags. Ignore the others.
1350        // If no tests relevant to this function have been specified, signal an error.
1351        // TODO: is this really the right thing to do? It's probably an error on
1352        // the caller's part, but logically we would just return 0 (no error).
1353        if ((this.fChecks & CONFUSABLE) == 0) {
1354            throw new IllegalArgumentException("No confusable checks are enabled.");
1355        }
1356
1357        // Compute the skeletons and check for confusability.
1358        String s1Skeleton = getSkeleton(s1);
1359        String s2Skeleton = getSkeleton(s2);
1360        if (!s1Skeleton.equals(s2Skeleton)) {
1361            return 0;
1362        }
1363
1364        // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes
1365        // of confusables according to UTS 39 section 4.
1366        // Start by computing the resolved script sets of s1 and s2.
1367        ScriptSet s1RSS = new ScriptSet();
1368        getResolvedScriptSet(s1, s1RSS);
1369        ScriptSet s2RSS = new ScriptSet();
1370        getResolvedScriptSet(s2, s2RSS);
1371
1372        // Turn on all applicable flags
1373        int result = 0;
1374        if (s1RSS.intersects(s2RSS)) {
1375            result |= SINGLE_SCRIPT_CONFUSABLE;
1376        } else {
1377            result |= MIXED_SCRIPT_CONFUSABLE;
1378            if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) {
1379                result |= WHOLE_SCRIPT_CONFUSABLE;
1380            }
1381        }
1382
1383        // Turn off flags that the user doesn't want
1384        result &= fChecks;
1385
1386        return result;
1387    }
1388
1389    /**
1390     * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are
1391     * confusable if their skeletons are identical. See Unicode UAX 39 for additional information.
1392     *
1393     * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some
1394     * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
1395     *
1396     * Skeletons are computed using the algorithm and data described in Unicode UAX 39.
1397     *
1398     * @param str
1399     *            The input string whose skeleton will be generated.
1400     * @return The output skeleton string.
1401     *
1402     * @hide draft / provisional / internal are hidden on Android
1403     */
1404    public String getSkeleton(CharSequence str) {
1405        // Apply the skeleton mapping to the NFD normalized input string
1406        // Accumulate the skeleton, possibly unnormalized, in a String.
1407        String nfdId = nfdNormalizer.normalize(str);
1408        int normalizedLen = nfdId.length();
1409        StringBuilder skelSB = new StringBuilder();
1410        for (int inputIndex = 0; inputIndex < normalizedLen;) {
1411            int c = Character.codePointAt(nfdId, inputIndex);
1412            inputIndex += Character.charCount(c);
1413            this.fSpoofData.confusableLookup(c, skelSB);
1414        }
1415        String skelStr = skelSB.toString();
1416        skelStr = nfdNormalizer.normalize(skelStr);
1417        return skelStr;
1418    }
1419
1420    /**
1421     * Calls {@link SpoofChecker#getSkeleton(CharSequence id)}. Starting with ICU 55, the "type" parameter has been
1422     * ignored, and starting with ICU 58, this function has been deprecated.
1423     *
1424     * @param type
1425     *            No longer supported. Prior to ICU 55, was used to specify the mapping table SL, SA, ML, or MA.
1426     * @param id
1427     *            The input identifier whose skeleton will be generated.
1428     * @return The output skeleton string.
1429     *
1430     * @deprecated ICU 58
1431     */
1432    @Deprecated
1433    public String getSkeleton(int type, String id) {
1434        return getSkeleton(id);
1435    }
1436
1437    /**
1438     * Equality function. Return true if the two SpoofChecker objects incorporate the same confusable data and have
1439     * enabled the same set of checks.
1440     *
1441     * @param other
1442     *            the SpoofChecker being compared with.
1443     * @return true if the two SpoofCheckers are equal.
1444     * @hide draft / provisional / internal are hidden on Android
1445     */
1446    @Override
1447    public boolean equals(Object other) {
1448        if (!(other instanceof SpoofChecker)) {
1449            return false;
1450        }
1451        SpoofChecker otherSC = (SpoofChecker) other;
1452        if (fSpoofData != otherSC.fSpoofData && fSpoofData != null && !fSpoofData.equals(otherSC.fSpoofData)) {
1453            return false;
1454        }
1455        if (fChecks != otherSC.fChecks) {
1456            return false;
1457        }
1458        if (fAllowedLocales != otherSC.fAllowedLocales && fAllowedLocales != null
1459                && !fAllowedLocales.equals(otherSC.fAllowedLocales)) {
1460            return false;
1461        }
1462        if (fAllowedCharsSet != otherSC.fAllowedCharsSet && fAllowedCharsSet != null
1463                && !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) {
1464            return false;
1465        }
1466        if (fRestrictionLevel != otherSC.fRestrictionLevel) {
1467            return false;
1468        }
1469        return true;
1470    }
1471
1472    /**
1473     * @hide draft / provisional / internal are hidden on Android
1474     */
1475    @Override
1476    public int hashCode() {
1477        return fChecks
1478                ^ fSpoofData.hashCode()
1479                ^ fAllowedLocales.hashCode()
1480                ^ fAllowedCharsSet.hashCode()
1481                ^ fRestrictionLevel.ordinal();
1482    }
1483
1484    /**
1485     * Computes the augmented script set for a code point, according to UTS 39 section 5.1.
1486     */
1487    private static void getAugmentedScriptSet(int codePoint, ScriptSet result) {
1488        result.clear();
1489        UScript.getScriptExtensions(codePoint, result);
1490
1491        // Section 5.1 step 1
1492        if (result.get(UScript.HAN)) {
1493            result.set(UScript.HAN_WITH_BOPOMOFO);
1494            result.set(UScript.JAPANESE);
1495            result.set(UScript.KOREAN);
1496        }
1497        if (result.get(UScript.HIRAGANA)) {
1498            result.set(UScript.JAPANESE);
1499        }
1500        if (result.get(UScript.KATAKANA)) {
1501            result.set(UScript.JAPANESE);
1502        }
1503        if (result.get(UScript.HANGUL)) {
1504            result.set(UScript.KOREAN);
1505        }
1506        if (result.get(UScript.BOPOMOFO)) {
1507            result.set(UScript.HAN_WITH_BOPOMOFO);
1508        }
1509
1510        // Section 5.1 step 2
1511        if (result.get(UScript.COMMON) || result.get(UScript.INHERITED)) {
1512            result.setAll();
1513        }
1514    }
1515
1516    /**
1517     * Computes the resolved script set for a string, according to UTS 39 section 5.1.
1518     */
1519    private void getResolvedScriptSet(CharSequence input, ScriptSet result) {
1520        getResolvedScriptSetWithout(input, UScript.CODE_LIMIT, result);
1521    }
1522
1523    /**
1524     * Computes the resolved script set for a string, omitting characters having the specified script. If
1525     * UScript.CODE_LIMIT is passed as the second argument, all characters are included.
1526     */
1527    private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) {
1528        result.setAll();
1529
1530        ScriptSet temp = new ScriptSet();
1531        for (int utf16Offset = 0; utf16Offset < input.length();) {
1532            int codePoint = Character.codePointAt(input, utf16Offset);
1533            utf16Offset += Character.charCount(codePoint);
1534
1535            // Compute the augmented script set for the character
1536            getAugmentedScriptSet(codePoint, temp);
1537
1538            // Intersect the augmented script set with the resolved script set, but only if the character doesn't
1539            // have the script specified in the function call
1540            if (script == UScript.CODE_LIMIT || !temp.get(script)) {
1541                result.and(temp);
1542            }
1543        }
1544    }
1545
1546    /**
1547     * Computes the set of numerics for a string, according to UTS 39 section 5.3.
1548     */
1549    private void getNumerics(String input, UnicodeSet result) {
1550        result.clear();
1551
1552        for (int utf16Offset = 0; utf16Offset < input.length();) {
1553            int codePoint = Character.codePointAt(input, utf16Offset);
1554            utf16Offset += Character.charCount(codePoint);
1555
1556            // Store a representative character for each kind of decimal digit
1557            if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
1558                // Store the zero character as a representative for comparison.
1559                // Unicode guarantees it is codePoint - value
1560                result.add(codePoint - UCharacter.getNumericValue(codePoint));
1561            }
1562        }
1563    }
1564
1565    /**
1566     * Computes the restriction level of a string, according to UTS 39 section 5.2.
1567     */
1568    private RestrictionLevel getRestrictionLevel(String input) {
1569        // Section 5.2 step 1:
1570        if (!fAllowedCharsSet.containsAll(input)) {
1571            return RestrictionLevel.UNRESTRICTIVE;
1572        }
1573
1574        // Section 5.2 step 2:
1575        if (ASCII.containsAll(input)) {
1576            return RestrictionLevel.ASCII;
1577        }
1578
1579        // Section 5.2 steps 3:
1580        ScriptSet resolvedScriptSet = new ScriptSet();
1581        getResolvedScriptSet(input, resolvedScriptSet);
1582
1583        // Section 5.2 step 4:
1584        if (!resolvedScriptSet.isEmpty()) {
1585            return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE;
1586        }
1587
1588        // Section 5.2 step 5:
1589        ScriptSet resolvedNoLatn = new ScriptSet();
1590        getResolvedScriptSetWithout(input, UScript.LATIN, resolvedNoLatn);
1591
1592        // Section 5.2 step 6:
1593        if (resolvedNoLatn.get(UScript.HAN_WITH_BOPOMOFO) || resolvedNoLatn.get(UScript.JAPANESE)
1594                || resolvedNoLatn.get(UScript.KOREAN)) {
1595            return RestrictionLevel.HIGHLY_RESTRICTIVE;
1596        }
1597
1598        // Section 5.2 step 7:
1599        if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) && !resolvedNoLatn.get(UScript.GREEK)
1600                && !resolvedNoLatn.get(UScript.CHEROKEE)) {
1601            return RestrictionLevel.MODERATELY_RESTRICTIVE;
1602        }
1603
1604        // Section 5.2 step 8:
1605        return RestrictionLevel.MINIMALLY_RESTRICTIVE;
1606    }
1607
    // Data Members
    private int fChecks; // Bit vector of checks to perform.
    // Confusable mapping data; getSkeleton() looks up every code point of the normalized input in it.
    private SpoofData fSpoofData;
    private Set<ULocale> fAllowedLocales; // The Set of allowed locales.
    // Consulted by the CHAR_LIMIT check in failsChecks() and by step 1 of getRestrictionLevel().
    private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
    // Threshold for the RESTRICTION_LEVEL check: text whose computed level compares greater than this fails.
    private RestrictionLevel fRestrictionLevel;

    // NFD normalizer used by getSkeleton() and by the INVISIBLE check in failsChecks().
    private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance();
1616
1617    // Confusable Mappings Data Structures, version 2.0
1618    //
1619    // This description and the corresponding implementation are to be kept
1620    // in-sync with the copy in icu4c uspoof_impl.h.
1621    //
1622    // For the confusable data, we are essentially implementing a map,
1623    //     key: a code point
1624    //     value: a string. Most commonly one char in length, but can be more.
1625    //
1626    // The keys are stored as a sorted array of 32 bit ints.
1627    //         bits 0-23 a code point value
1628    //         bits 24-31 length of value string, in UChars (between 1 and 256 UChars).
1629    //     The key table is sorted in ascending code point order. (not on the
1630    //     32 bit int value, the flag bits do not participate in the sorting.)
1631    //
1632    //     Lookup is done by means of a binary search in the key table.
1633    //
1634    // The corresponding values are kept in a parallel array of 16 bit ints.
1635    //     If the value string is of length 1, it is literally in the value array.
1636    //     For longer strings, the value array contains an index into the strings
1637    //     table.
1638    //
1639    // String Table:
    //     The strings table contains all of the value strings (those of length two or greater)
    //     concatenated together into one long char (UTF-16) array.
1642    //
1643    //     There is no nul character or other mark between adjacent strings.
1644    //
1645    //----------------------------------------------------------------------------
1646    //
1647    //  Changes from format version 1 to format version 2:
1648    //        1) Removal of the whole-script confusable data tables.
1649    //        2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask.
1650    //        3) Expansion of string length value in the key bitmask from 2 bits to 8 bits.
1651    //        4) Removal of the string lengths table since 8 bits is sufficient for the
1652    //           lengths of all entries in confusables.txt.
1653    //
1654    private static final class ConfusableDataUtils {
1655        public static final int FORMAT_VERSION = 2; // version for ICU 58
1656
1657        public static final int keyToCodePoint(int key) {
1658            return key & 0x00ffffff;
1659        }
1660
1661        public static final int keyToLength(int key) {
1662            return ((key & 0xff000000) >> 24) + 1;
1663        }
1664
1665        public static final int codePointAndLengthToKey(int codePoint, int length) {
1666            assert (codePoint & 0x00ffffff) == codePoint;
1667            assert length <= 256;
1668            return codePoint | ((length - 1) << 24);
1669        }
1670    }
1671
1672    // -------------------------------------------------------------------------------------
1673    //
1674    // SpoofData
1675    //
1676    // This class corresponds to the ICU SpoofCheck data.
1677    //
1678    // The data can originate with the Binary ICU data that is generated in ICU4C,
1679    // or it can originate from source rules that are compiled in ICU4J.
1680    //
1681    // This class does not include the set of checks to be performed, but only
1682    // data that is serialized into the ICU binary data.
1683    //
1684    // Because Java cannot easily wrap binary data like ICU4C, the binary data is
1685    // copied into Java structures that are convenient for use by the run time code.
1686    //
1687    // ---------------------------------------------------------------------------------------
1688    private static class SpoofData {
1689
1690        // The Confusable data, Java data structures for.
1691        int[] fCFUKeys;
1692        short[] fCFUValues;
1693        String fCFUStrings;
1694
1695        private static final int DATA_FORMAT = 0x43667520; // "Cfu "
1696
1697        private static final class IsAcceptable implements Authenticate {
1698            @Override
1699            public boolean isDataVersionAcceptable(byte version[]) {
1700                return version[0] == ConfusableDataUtils.FORMAT_VERSION || version[1] != 0 || version[2] != 0
1701                        || version[3] != 0;
1702            }
1703        }
1704
1705        private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
1706
1707        private static final class DefaultData {
1708            private static SpoofData INSTANCE = null;
1709            private static IOException EXCEPTION = null;
1710
1711            static {
1712                // Note: Although this is static, the Java runtime can delay execution of this block until
1713                // the data is actually requested via SpoofData.getDefault().
1714                try {
1715                    INSTANCE = new SpoofData(ICUBinary.getRequiredData("confusables.cfu"));
1716                } catch (IOException e) {
1717                    EXCEPTION = e;
1718                }
1719            }
1720        }
1721
1722        /**
1723         * @return instance for Unicode standard data
1724         */
1725        public static SpoofData getDefault() {
1726            if (DefaultData.EXCEPTION != null) {
1727                throw new MissingResourceException(
1728                        "Could not load default confusables data: " + DefaultData.EXCEPTION.getMessage(),
1729                        "SpoofChecker", "");
1730            }
1731            return DefaultData.INSTANCE;
1732        }
1733
1734        // SpoofChecker Data constructor for use from data builder.
1735        // Initializes a new, empty data area that will be populated later.
1736        private SpoofData() {
1737        }
1738
1739        // Constructor for use when creating from prebuilt default data.
1740        // A ByteBuffer is what the ICU internal data loading functions provide.
1741        private SpoofData(ByteBuffer bytes) throws java.io.IOException {
1742            ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE);
1743            bytes.mark();
1744            readData(bytes);
1745        }
1746
1747        @Override
1748        public boolean equals(Object other) {
1749            if (!(other instanceof SpoofData)) {
1750                return false;
1751            }
1752            SpoofData otherData = (SpoofData) other;
1753            if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys))
1754                return false;
1755            if (!Arrays.equals(fCFUValues, otherData.fCFUValues))
1756                return false;
1757            if (!Utility.sameObjects(fCFUStrings, otherData.fCFUStrings) && fCFUStrings != null
1758                    && !fCFUStrings.equals(otherData.fCFUStrings))
1759                return false;
1760            return true;
1761        }
1762
1763        @Override
1764        public int hashCode() {
1765            return Arrays.hashCode(fCFUKeys)
1766                    ^ Arrays.hashCode(fCFUValues)
1767                    ^ fCFUStrings.hashCode();
1768        }
1769
1770        // Set the SpoofChecker data from pre-built binary data in a byte buffer.
1771        // The binary data format is as described for ICU4C spoof data.
1772        //
1773        private void readData(ByteBuffer bytes) throws java.io.IOException {
1774            int magic = bytes.getInt();
1775            if (magic != 0x3845fdef) {
1776                throw new IllegalArgumentException("Bad Spoof Check Data.");
1777            }
1778            @SuppressWarnings("unused")
1779            int dataFormatVersion = bytes.getInt();
1780            @SuppressWarnings("unused")
1781            int dataLength = bytes.getInt();
1782
1783            int CFUKeysOffset = bytes.getInt();
1784            int CFUKeysSize = bytes.getInt();
1785
1786            int CFUValuesOffset = bytes.getInt();
1787            int CFUValuesSize = bytes.getInt();
1788
1789            int CFUStringTableOffset = bytes.getInt();
1790            int CFUStringTableSize = bytes.getInt();
1791
1792            // We have now read the file header, and obtained the position for each
1793            // of the data items. Now read each in turn, first seeking the
1794            // input stream to the position of the data item.
1795
1796            bytes.reset();
1797            ICUBinary.skipBytes(bytes, CFUKeysOffset);
1798            fCFUKeys = ICUBinary.getInts(bytes, CFUKeysSize, 0);
1799
1800            bytes.reset();
1801            ICUBinary.skipBytes(bytes, CFUValuesOffset);
1802            fCFUValues = ICUBinary.getShorts(bytes, CFUValuesSize, 0);
1803
1804            bytes.reset();
1805            ICUBinary.skipBytes(bytes, CFUStringTableOffset);
1806            fCFUStrings = ICUBinary.getString(bytes, CFUStringTableSize, 0);
1807        }
1808
1809        /**
1810         * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be
1811         * appended will between 1 and 18 characters as of Unicode 9.
1812         *
1813         * This is the heart of the confusable skeleton generation implementation.
1814         */
1815        public void confusableLookup(int inChar, StringBuilder dest) {
1816            // Perform a binary search.
1817            // [lo, hi), i.e lo is inclusive, hi is exclusive.
1818            // The result after the loop will be in lo.
1819            int lo = 0;
1820            int hi = length();
1821            do {
1822                int mid = (lo + hi) / 2;
1823                if (codePointAt(mid) > inChar) {
1824                    hi = mid;
1825                } else if (codePointAt(mid) < inChar) {
1826                    lo = mid;
1827                } else {
1828                    // Found result. Break early.
1829                    lo = mid;
1830                    break;
1831                }
1832            } while (hi - lo > 1);
1833
1834            // Did we find an entry? If not, the char maps to itself.
1835            if (codePointAt(lo) != inChar) {
1836                dest.appendCodePoint(inChar);
1837                return;
1838            }
1839
1840            // Add the element to the string builder and return.
1841            appendValueTo(lo, dest);
1842            return;
1843        }
1844
1845        /**
1846         * Return the number of confusable entries in this SpoofData.
1847         *
1848         * @return The number of entries.
1849         */
1850        public int length() {
1851            return fCFUKeys.length;
1852        }
1853
1854        /**
1855         * Return the code point (key) at the specified index.
1856         *
1857         * @param index
1858         *            The index within the SpoofData.
1859         * @return The code point.
1860         */
1861        public int codePointAt(int index) {
1862            return ConfusableDataUtils.keyToCodePoint(fCFUKeys[index]);
1863        }
1864
1865        /**
1866         * Append the confusable skeleton at the specified index to the StringBuilder dest.
1867         *
1868         * @param index
1869         *            The index within the SpoofData.
1870         * @param dest
1871         *            The StringBuilder to which to append the skeleton.
1872         */
1873        public void appendValueTo(int index, StringBuilder dest) {
1874            int stringLength = ConfusableDataUtils.keyToLength(fCFUKeys[index]);
1875
1876            // Value is either a char (for strings of length 1) or
1877            // an index into the string table (for longer strings)
1878            short value = fCFUValues[index];
1879            if (stringLength == 1) {
1880                dest.append((char) value);
1881            } else {
1882                dest.append(fCFUStrings, value, value + stringLength);
1883            }
1884        }
1885    }
1886
1887    // -------------------------------------------------------------------------------
1888    //
1889    // ScriptSet - Script code bit sets.
1890    // Extends Java BitSet with input/output support and a few helper methods.
1891    // Note: The I/O is not currently being used, so it has been commented out. If
1892    // it is needed again, the code can be restored.
1893    //
1894    // -------------------------------------------------------------------------------
1895    static class ScriptSet extends BitSet {
1896
1897        // Eclipse default value to quell warnings:
1898        private static final long serialVersionUID = 1L;
1899
1900        // // The serialized version of this class can hold INT_CAPACITY * 32 scripts.
1901        // private static final int INT_CAPACITY = 6;
1902        // private static final long serialVersionUID = INT_CAPACITY;
1903        // static {
1904        // assert ScriptSet.INT_CAPACITY * Integer.SIZE <= UScript.CODE_LIMIT;
1905        // }
1906        //
1907        // public ScriptSet() {
1908        // }
1909        //
1910        // public ScriptSet(ByteBuffer bytes) throws java.io.IOException {
1911        // for (int i = 0; i < INT_CAPACITY; i++) {
1912        // int bits = bytes.getInt();
1913        // for (int j = 0; j < Integer.SIZE; j++) {
1914        // if ((bits & (1 << j)) != 0) {
1915        // set(i * Integer.SIZE + j);
1916        // }
1917        // }
1918        // }
1919        // }
1920        //
1921        // public void output(DataOutputStream os) throws java.io.IOException {
1922        // for (int i = 0; i < INT_CAPACITY; i++) {
1923        // int bits = 0;
1924        // for (int j = 0; j < Integer.SIZE; j++) {
1925        // if (get(i * Integer.SIZE + j)) {
1926        // bits |= (1 << j);
1927        // }
1928        // }
1929        // os.writeInt(bits);
1930        // }
1931        // }
1932
1933        public void and(int script) {
1934            this.clear(0, script);
1935            this.clear(script + 1, UScript.CODE_LIMIT);
1936        }
1937
1938        public void setAll() {
1939            this.set(0, UScript.CODE_LIMIT);
1940        }
1941
1942        public boolean isFull() {
1943            return cardinality() == UScript.CODE_LIMIT;
1944        }
1945
1946        public void appendStringTo(StringBuilder sb) {
1947            sb.append("{ ");
1948            if (isEmpty()) {
1949                sb.append("- ");
1950            } else if (isFull()) {
1951                sb.append("* ");
1952            } else {
1953                for (int script = 0; script < UScript.CODE_LIMIT; script++) {
1954                    if (get(script)) {
1955                        sb.append(UScript.getShortName(script));
1956                        sb.append(" ");
1957                    }
1958                }
1959            }
1960            sb.append("}");
1961        }
1962
1963        @Override
1964        public String toString() {
1965            StringBuilder sb = new StringBuilder();
1966            sb.append("<ScriptSet ");
1967            appendStringTo(sb);
1968            sb.append(">");
1969            return sb.toString();
1970        }
1971    }
1972}
1973