1/* GENERATED SOURCE. DO NOT MODIFY. */ 2// © 2016 and later: Unicode, Inc. and others. 3// License & terms of use: http://www.unicode.org/copyright.html#License 4/* 5 *************************************************************************** 6 * Copyright (C) 2008-2016 International Business Machines Corporation 7 * and others. All Rights Reserved. 8 *************************************************************************** 9 * 10 * Unicode Spoof Detection 11 */ 12 13package android.icu.text; 14 15import java.io.IOException; 16import java.io.LineNumberReader; 17import java.io.Reader; 18import java.nio.ByteBuffer; 19import java.text.ParseException; 20import java.util.ArrayList; 21import java.util.Arrays; 22import java.util.BitSet; 23import java.util.Collections; 24import java.util.Comparator; 25import java.util.HashSet; 26import java.util.Hashtable; 27import java.util.LinkedHashSet; 28import java.util.Locale; 29import java.util.MissingResourceException; 30import java.util.Set; 31import java.util.Vector; 32import java.util.regex.Matcher; 33import java.util.regex.Pattern; 34 35import android.icu.impl.ICUBinary; 36import android.icu.impl.ICUBinary.Authenticate; 37import android.icu.impl.Utility; 38import android.icu.lang.UCharacter; 39import android.icu.lang.UCharacterCategory; 40import android.icu.lang.UProperty; 41import android.icu.lang.UScript; 42import android.icu.util.ULocale; 43 44/** 45 * <p> 46 * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and 47 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions: 48 * 49 * <ol> 50 * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desordenado" and 51 * "ԁеѕогԁепаԁо".</li> 52 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof 53 * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li> 54 * </ol> 55 * 56 
* <p> 57 * Although originally designed as a method for flagging suspicious identifier strings such as URLs, 58 * <code>SpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word 59 * content filters. 60 * 61 * <h2>Confusables</h2> 62 * 63 * <p> 64 * The following example shows how to use <code>SpoofChecker</code> to check for confusability between two strings: 65 * 66 * <pre> 67 * <code> 68 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 69 * int result = sc.areConfusable("desordenado", "ԁеѕогԁепаԁо"); 70 * System.out.println(result != 0); // true 71 * </code> 72 * </pre> 73 * 74 * <p> 75 * <code>SpoofChecker</code> uses a builder paradigm: options are specified within the context of a lightweight 76 * {@link SpoofChecker.Builder} object, and upon calling {@link SpoofChecker.Builder#build}, expensive data loading 77 * operations are performed, and an immutable <code>SpoofChecker</code> is returned. 78 * 79 * <p> 80 * The first line of the example creates a <code>SpoofChecker</code> object with confusable-checking enabled; the second 81 * line performs the confusability test. For best performance, the instance should be created once (e.g., upon 82 * application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime. 83 * 84 * <p> 85 * UTS 39 defines two strings to be <em>confusable</em> if they map to the same skeleton. A <em>skeleton</em> is a 86 * sequence of families of confusable characters, where each family has a single exemplar character. 
87 * {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is 88 * equivalent to the example above: 89 * 90 * <pre> 91 * <code> 92 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 93 * boolean result = sc.getSkeleton("desordenado").equals(sc.getSkeleton("ԁеѕогԁепаԁо")); 94 * System.out.println(result); // true 95 * </code> 96 * </pre> 97 * 98 * <p> 99 * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling 100 * {@link SpoofChecker#areConfusable} many times in a loop, {@link SpoofChecker#getSkeleton} can be used instead, as 101 * shown below: 102 * 103 * <pre> 104 * // Setup: 105 * String[] DICTIONARY = new String[]{ "lorem", "ipsum" }; // example 106 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 107 * HashSet<String> skeletons = new HashSet<String>(); 108 * for (String word : DICTIONARY) { 109 * skeletons.add(sc.getSkeleton(word)); 110 * } 111 * 112 * // Live Check: 113 * boolean result = skeletons.contains(sc.getSkeleton("1orern")); 114 * System.out.println(result); // true 115 * </pre> 116 * 117 * <p> 118 * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em> 119 * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons 120 * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons. 
121 * 122 * <h2>Spoof Detection</h2> 123 * 124 * <p> 125 * The following snippet shows a minimal example of using <code>SpoofChecker</code> to perform spoof detection on a 126 * string: 127 * 128 * <pre> 129 * SpoofChecker sc = new SpoofChecker.Builder() 130 * .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION)) 131 * .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE) 132 * .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE) 133 * .build(); 134 * boolean result = sc.failsChecks("pаypаl"); // with Cyrillic 'а' characters 135 * System.out.println(result); // true 136 * </pre> 137 * 138 * <p> 139 * As in the case for confusability checking, it is good practice to create one <code>SpoofChecker</code> instance at 140 * startup, and call the cheaper {@link SpoofChecker#failsChecks} online. In the second line, we specify the set of 141 * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. In the 142 * third line, the CONFUSABLE checks are disabled. It is good practice to disable them if you won't be using the 143 * instance to perform confusability checking. 144 * 145 * <p> 146 * To get more details on why a string failed the checks, use a {@link SpoofChecker.CheckResult}: 147 * 148 * <pre> 149 * <code> 150 * SpoofChecker sc = new SpoofChecker.Builder() 151 * .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION)) 152 * .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE) 153 * .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE) 154 * .build(); 155 * SpoofChecker.CheckResult checkResult = new SpoofChecker.CheckResult(); 156 * boolean result = sc.failsChecks("pаypаl", checkResult); 157 * System.out.println(checkResult.checks); // 16 158 * </code> 159 * </pre> 160 * 161 * <p> 162 * The return value is a bitmask of the checks that failed. 
In this case, there was one check that failed: 163 * {@link SpoofChecker#RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are: 164 * 165 * <ul> 166 * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the 167 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS 168 * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li> 169 * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character 170 * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li> 171 * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable 172 * characters. See {@link SpoofChecker.Builder#setAllowedChars} and {@link SpoofChecker.Builder#setAllowedLocales}.</li> 173 * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li> 174 * </ul> 175 * 176 * <p> 177 * These checks can be enabled independently of each other. For example, if you were interested in checking for only the 178 * INVISIBLE and MIXED_NUMBERS conditions, you could do: 179 * 180 * <pre> 181 * <code> 182 * SpoofChecker sc = new SpoofChecker.Builder() 183 * .setChecks(SpoofChecker.INVISIBLE | SpoofChecker.MIXED_NUMBERS) 184 * .build(); 185 * boolean result = sc.failsChecks("৪8"); 186 * System.out.println(result); // true 187 * </code> 188 * </pre> 189 * 190 * <p> 191 * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in 192 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings 193 * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have 194 * Latin characters interspersed. 
Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
 * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
 * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
 * the levels, see UTS 39 or {@link SpoofChecker.RestrictionLevel}. The Restriction Level test is aware of the set of
 * allowed characters set in {@link SpoofChecker.Builder#setAllowedChars}. Note that characters which have script code
 * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
 * scripts.
 *
 * <h2>Additional Information</h2>
 *
 * <p>
 * A <code>SpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
 *
 * <p>
 * <b>Thread Safety:</b> The methods on <code>SpoofChecker</code> objects are thread safe. The test functions for
 * checking a single identifier, or for testing whether two identifiers are potentially confusable, may be called
 * concurrently from multiple threads using the same <code>SpoofChecker</code> instance.
 *
 * @hide Only a subset of ICU is exposed in Android
 */
public class SpoofChecker {

    /**
     * Constants from UTS 39 for use in {@link SpoofChecker.Builder#setRestrictionLevel}.
     *
     * <p>
     * The constants are declared from the most restrictive level ({@link #ASCII}) to the least restrictive one
     * ({@link #UNRESTRICTIVE}); each level also accepts every string accepted by the level declared before it.
     */
    public enum RestrictionLevel {
        /**
         * All characters in the string are in the identifier profile and all characters in the string are in the ASCII
         * range.
         */
        ASCII,
        /**
         * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and the
         * string is single-script, according to the definition in UTS 39 section 5.1.
         */
        SINGLE_SCRIPT_RESTRICTIVE,
        /**
         * The string classifies as Single Script, or all characters in the string are in the identifier profile and the
         * string is covered by any of the following sets of scripts, according to the definition in UTS 39 section 5.1:
         * <ul>
         * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
         * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
         * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
         * </ul>
         */
        HIGHLY_RESTRICTIVE,
        /**
         * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
         * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
         * Greek, and Cherokee.
         */
        MODERATELY_RESTRICTIVE,
        /**
         * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts, such as
         * Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us.
         */
        MINIMALLY_RESTRICTIVE,
        /**
         * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
         */
        UNRESTRICTIVE,
    }

    /**
     * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}. The set is
     * frozen, so callers that want to extend it must work on a thawed clone.
     *
     * @hide draft / provisional / internal are hidden on Android
     */
    public static final UnicodeSet INCLUSION = new UnicodeSet(
            "['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u200C\\u200D\\u2010\\u"
            + "2019\\u2027\\u30A0\\u30FB]").freeze();
    // Note: data from http://unicode.org/Public/security/9.0.0/IdentifierStatus.txt
    // There is tooling to generate this constant in the unicodetools project:
    //     org.unicode.text.tools.RecommendedSetGenerator
    // It will print the Java and C++ code to the console for easy copy-paste into this file.

    /**
     * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
272 * 273 * @hide draft / provisional / internal are hidden on Android 274 */ 275 public static final UnicodeSet RECOMMENDED = new UnicodeSet( 276 "[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u014" 277 + "8\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E" 278 + "6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02B" 279 + "C\\u02EC\\u0300-\\u0304\\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u03" 280 + "28\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386" 281 + "\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-\\u0529\\u05" 282 + "2E\\u052F\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0" 283 + "620-\\u063F\\u0641-\\u0655\\u0660-\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-" 284 + "\\u06D3\\u06D5\\u06E5\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2" 285 + "\\u08B6-\\u08BD\\u0901-\\u094D\\u094F\\u0950\\u0956\\u0957\\u0960-\\u0963\\u0966-\\u096" 286 + "F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u0" 287 + "9A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CE\\u" 288 + "09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-" 289 + "\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\" 290 + "u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A9" 291 + "3-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0" 292 + "ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F\\" 293 + "u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47" 294 + "\\u0B48\\u0B4B-\\u0B4D\\u0B56\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82\\u0B83" 295 + 
"\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3" 296 + "\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0B" 297 + "D0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u" 298 + "0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56" 299 + "\\u0C60\\u0C61\\u0C66-\\u0C6F\\u0C80\\u0C82\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92" 300 + "-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0" 301 + "CD5\\u0CD6\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1\\u0CF2\\u0D02\\u0D03\\u0D05-\\u0D0C\\u0" 302 + "D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D54-\\u0D57" 303 + "\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D9" 304 + "6\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0" 305 + "DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\" 306 + "u0E59\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u" 307 + "0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD\\" 308 + "u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE\\u0EDF\\u0F00\\u0F20-\\u0F29" 309 + "\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F" 310 + "56\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0" 311 + "F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6" 312 + "\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10" 313 + "C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D\\u" 314 + "1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2" 315 + 
"-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1" 316 + "315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-" 317 + "\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1C80-\\u1C88\\u1E00-\\u1E9" 318 + "9\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1" 319 + "F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F" 320 + "7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1" 321 + "FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-" 322 + "\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0" 323 + "-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u3" 324 + "005-\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E\\u30A1-\\u30FA\\u30FC-\\u30FE\\u" 325 + "3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FD5\\uA660\\uA661\\uA674-\\uA67B" 326 + "\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D\\uA78E\\uA790-\\uA793\\uA7A0-\\uA7AA\\uA7AE" 327 + "\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB" 328 + "11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E\\uFA0F\\uFA11\\uFA13\\uF" 329 + "A14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\U00020000-\\U0002A6D6\\U0002A700-\\U0" 330 + "002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1]").freeze(); 331 // Note: data from http://unicode.org/Public/security/9.0.0/IdentifierStatus.txt 332 // There is tooling to generate this constant in the unicodetools project: 333 // org.unicode.text.tools.RecommendedSetGenerator 334 // It will print the Java and C++ code to the console for easy copy-paste into this file. 335 336 /** 337 * Constants for the kinds of checks that USpoofChecker can perform. 
These values are used both to select the set of
     * checks that will be performed, and to report results from the check function.
     * Each flag occupies its own bit so that flags can be combined with bitwise OR.
     */

    /**
     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
     * 4.
     */
    public static final int SINGLE_SCRIPT_CONFUSABLE = 1; // bit 0

    /**
     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
     * 39 section 4.
     */
    public static final int MIXED_SCRIPT_CONFUSABLE = 2; // bit 1

    /**
     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are not from the same script but both of them are
     * single-script strings, according to UTS 39 section 4.
     */
    public static final int WHOLE_SCRIPT_CONFUSABLE = 4; // bit 2

    /**
     * Enable this flag in {@link SpoofChecker.Builder#setChecks} to turn on all types of confusables. You may set the
     * checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to make
     * {@link SpoofChecker#areConfusable} return only those types of confusables.
     *
     * <p>
     * The value is the bitwise OR of the three confusable flags (1 | 2 | 4 = 7).
     *
     * @hide draft / provisional / internal are hidden on Android
     */
    public static final int CONFUSABLE = SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE;

    /**
     * This flag is deprecated and no longer affects the behavior of SpoofChecker.
     *
     * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was
     *             deprecated.
     */
    @Deprecated
    public static final int ANY_CASE = 8; // bit 3

    /**
     * Check that an identifier satisfies the requirements for the restriction level specified in
     * {@link SpoofChecker.Builder#setRestrictionLevel}. The default restriction level is
     * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}.
     *
     * @hide draft / provisional / internal are hidden on Android
     */
    public static final int RESTRICTION_LEVEL = 16; // bit 4

    /**
     * Check that an identifier contains only characters from a single script (plus chars from the common and inherited
     * scripts.) Applies to checks of a single identifier check only.
     *
     * <p>
     * This constant is an alias: it has the same value as {@link #RESTRICTION_LEVEL}, so the two cannot be
     * distinguished in a check bitmask.
     *
     * @deprecated ICU 51 Use RESTRICTION_LEVEL
     */
    @Deprecated
    public static final int SINGLE_SCRIPT = RESTRICTION_LEVEL;

    /**
     * Check an identifier for the presence of invisible characters, such as zero-width spaces, or character sequences
     * that are likely not to display, such as multiple occurrences of the same non-spacing mark. This check does not
     * test the input string as a whole for conformance to any particular syntax for identifiers.
     */
    public static final int INVISIBLE = 32; // bit 5

    /**
     * Check that an identifier contains only characters from a specified set of acceptable characters. See
     * {@link Builder#setAllowedChars} and {@link Builder#setAllowedLocales}. Note that a string that fails this check
     * will also fail the {@link #RESTRICTION_LEVEL} check.
     */
    public static final int CHAR_LIMIT = 64; // bit 6

    /**
     * Check that an identifier does not mix numbers from different numbering systems. For more information, see UTS 39
     * section 5.3.
     *
     * @hide draft / provisional / internal are hidden on Android
     */
    public static final int MIXED_NUMBERS = 128; // bit 7

    // Update CheckResult.toString() when a new check is added.

    /**
     * Enable all spoof checks.
*/
    public static final int ALL_CHECKS = 0xFFFFFFFF; // every bit set, including bits with no check assigned yet

    // Used for checking for ASCII-Only restriction level
    static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();

    /**
     * private constructor: a SpoofChecker has to be built by the builder
     */
    private SpoofChecker() {
    }

    /**
     * SpoofChecker Builder. To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired
     * checking options on the builder, then call the build() function to create a SpoofChecker instance.
     */
    public static class Builder {
        int fChecks; // Bit vector of checks to perform.
        SpoofData fSpoofData; // Confusable data; null until setData() or build() supplies it.
        final UnicodeSet fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); // The UnicodeSet of allowed characters.
        // for this Spoof Checker. Defaults to all chars.
        final Set<ULocale> fAllowedLocales = new LinkedHashSet<ULocale>(); // The list of allowed locales.
        private RestrictionLevel fRestrictionLevel; // Loosest restriction level accepted.

        /**
         * Constructor: Create a default Unicode Spoof Checker Builder. The check bitmask starts as
         * {@link SpoofChecker#ALL_CHECKS}, the restriction level as {@link RestrictionLevel#HIGHLY_RESTRICTIVE}, and
         * no spoof data is attached yet. Note that additional checks may be added in the future, resulting in the
         * changes to the default checking behavior.
         *
         * <p>
         * NOTE(review): this comment used to say that the default excludes LOCALE_LIMIT and CHAR_LIMIT. The code
         * sets every bit of the mask, CHAR_LIMIT included, and no LOCALE_LIMIT constant is declared in this part of
         * the file. The allowed-character set does default to all code points, so presumably the CHAR_LIMIT check
         * cannot fail until a narrower set is configured; confirm against the checking code.
         */
        public Builder() {
            fChecks = ALL_CHECKS;
            fSpoofData = null;
            fRestrictionLevel = RestrictionLevel.HIGHLY_RESTRICTIVE;
        }

        /**
         * Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker.
         * The allowed characters and allowed locales are copied into the builder's own collections; the spoof data
         * object is shared by reference.
         *
         * @param src
         *            The existing checker.
         */
        public Builder(SpoofChecker src) {
            fChecks = src.fChecks;
            fSpoofData = src.fSpoofData; // For the data, we will either use the source data
                                         // as-is, or drop the builder's reference to it
                                         // and generate new data, depending on what our
                                         // caller does with the builder.
            fAllowedCharsSet.set(src.fAllowedCharsSet);
            fAllowedLocales.addAll(src.fAllowedLocales);
            fRestrictionLevel = src.fRestrictionLevel;
        }

        /**
         * Create a SpoofChecker with current configuration. If no spoof data has been supplied through
         * {@link #setData(Reader)}, the default data is obtained from <code>SpoofData.getDefault()</code> and kept
         * in the builder for later builds.
         *
         * @return SpoofChecker
         */
        public SpoofChecker build() {
            // TODO: Make this data loading be lazy (see #12696).
            if (fSpoofData == null) {
                // read binary file
                fSpoofData = SpoofData.getDefault();
            }

            // Copy all state from the builder to the new SpoofChecker.
            // Make sure that everything is either cloned or copied, so
            // that subsequent re-use of the builder won't modify the built
            // SpoofChecker.
            //
            // One exception to this: the SpoofData is just assigned.
            // If the builder subsequently needs to modify fSpoofData
            // it will create a new SpoofData object first.

            SpoofChecker result = new SpoofChecker();
            result.fChecks = this.fChecks;
            result.fSpoofData = this.fSpoofData;
            // Clone, then freeze, so the checker's set is independent of the builder and immutable.
            result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone());
            result.fAllowedCharsSet.freeze();
            result.fAllowedLocales = new HashSet<ULocale>(this.fAllowedLocales);
            result.fRestrictionLevel = this.fRestrictionLevel;
            return result;
        }

        /**
         * Specify the source form of the spoof data for this Spoof Checker. The inputs correspond to the Unicode data
         * file confusables.txt as described in Unicode UTS 39. The syntax of the source data is as described in UTS 39
         * for these files, and the content of these files is acceptable input.
         *
         * @param confusables
         *            the Reader of confusable characters definitions, as found in file confusables.txt from
         *            unicode.org.
         * @return self
         * @throws ParseException
         *             To report syntax errors in the input.
         * @throws IOException
         *             Declared by this method; presumably raised when reading from the Reader fails (confirm in
         *             ConfusabledataBuilder).
         *
         * @hide draft / provisional / internal are hidden on Android
         */
        public Builder setData(Reader confusables) throws ParseException, IOException {

            // Compile the binary data from the source (text) format.
            // Drop the builder's reference to any pre-existing data, which may
            // be in use in an already-built checker.

            fSpoofData = new SpoofData();
            ConfusabledataBuilder.buildConfusableData(confusables, fSpoofData);
            return this;
        }

        /**
         * Deprecated as of ICU 58; use {@link SpoofChecker.Builder#setData(Reader confusables)} instead. This
         * overload simply forwards the first argument to that method; the second argument is never read.
         *
         * @param confusables
         *            the Reader of confusable characters definitions, as found in file confusables.txt from
         *            unicode.org.
         * @param confusablesWholeScript
         *            No longer supported.
         * @return self
         * @throws ParseException
         *             To report syntax errors in the input.
         * @throws IOException
         *             Propagated from {@link #setData(Reader)}.
         *
         * @deprecated ICU 58
         */
        @Deprecated
        public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, IOException {
            setData(confusables);
            return this;
        }

        /**
         * Specify the bitmask of checks that will be performed by {@link SpoofChecker#failsChecks}. Calling this method
         * overwrites any checks that may have already been enabled. By default, all checks are enabled.
         *
         * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together. For
         * example, to fail strings containing characters outside of the set specified by {@link #setAllowedChars} and
         * also strings that contain digits from mixed numbering systems:
         *
         * <pre>
         * {@code
         * builder.setChecks(SpoofChecker.CHAR_LIMIT | SpoofChecker.MIXED_NUMBERS);
         * }
         * </pre>
         *
         * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from
         * ALL_CHECKS.
For example, if you are not planning to use the {@link SpoofChecker#areConfusable} functionality, 567 * it is good practice to disable the CONFUSABLE check: 568 * 569 * <pre> 570 * {@code 571 * builder.setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CONFUSABLE); 572 * } 573 * </pre> 574 * 575 * Note that methods such as {@link #setAllowedChars}, {@link #setAllowedLocales}, and 576 * {@link #setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they 577 * enable onto the existing bitmask specified by this method. For more details, see the documentation of those 578 * methods. 579 * 580 * @param checks 581 * The set of checks that this spoof checker will perform. The value is an 'or' of the desired 582 * checks. 583 * @return self 584 */ 585 public Builder setChecks(int checks) { 586 // Verify that the requested checks are all ones (bits) that 587 // are acceptable, known values. 588 if (0 != (checks & ~SpoofChecker.ALL_CHECKS)) { 589 throw new IllegalArgumentException("Bad Spoof Checks value."); 590 } 591 this.fChecks = (checks & SpoofChecker.ALL_CHECKS); 592 return this; 593 } 594 595 /** 596 * Limit characters that are acceptable in identifiers being checked to those normally used with the languages 597 * associated with the specified locales. Any previously specified list of locales is replaced by the new 598 * settings. 599 * 600 * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is 601 * determined. Characters from this set of scripts, along with characters from the "common" and "inherited" 602 * Unicode Script categories will be permitted. 603 * 604 * Supplying an empty string removes all restrictions; characters from any script will be allowed. 605 * 606 * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a 607 * non-empty list of locales. 
608 * 609 * The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function. 610 * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters. 611 * 612 * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of 613 * {@link #setAllowedChars} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with 614 * the Unicode Set API, then resetting the spoof detectors limits with {@link #setAllowedChars}. 615 * 616 * @param locales 617 * A Set of ULocales, from which the language and associated script are extracted. If the locales Set 618 * is null, no restrictions will be placed on the allowed characters. 619 * 620 * @return self 621 */ 622 public Builder setAllowedLocales(Set<ULocale> locales) { 623 fAllowedCharsSet.clear(); 624 625 for (ULocale locale : locales) { 626 // Add the script chars for this locale to the accumulating set 627 // of allowed chars. 628 addScriptChars(locale, fAllowedCharsSet); 629 } 630 631 // If our caller provided an empty list of locales, we disable the 632 // allowed characters checking 633 fAllowedLocales.clear(); 634 if (locales.size() == 0) { 635 fAllowedCharsSet.add(0, 0x10ffff); 636 fChecks &= ~CHAR_LIMIT; 637 return this; 638 } 639 640 // Add all common and inherited characters to the set of allowed 641 // chars. 642 UnicodeSet tempSet = new UnicodeSet(); 643 tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON); 644 fAllowedCharsSet.addAll(tempSet); 645 tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED); 646 fAllowedCharsSet.addAll(tempSet); 647 648 // Store the updated spoof checker state. 
649 fAllowedLocales.clear(); 650 fAllowedLocales.addAll(locales); 651 fChecks |= CHAR_LIMIT; 652 return this; 653 } 654 655 /** 656 * Limit characters that are acceptable in identifiers being checked to those normally used with the languages 657 * associated with the specified locales. Any previously specified list of locales is replaced by the new 658 * settings. 659 * 660 * @param locales 661 * A Set of Locales, from which the language and associated script are extracted. If the locales Set 662 * is null, no restrictions will be placed on the allowed characters. 663 * 664 * @return self 665 */ 666 public Builder setAllowedJavaLocales(Set<Locale> locales) { 667 HashSet<ULocale> ulocales = new HashSet<ULocale>(locales.size()); 668 for (Locale locale : locales) { 669 ulocales.add(ULocale.forLocale(locale)); 670 } 671 return setAllowedLocales(ulocales); 672 } 673 674 // Add (union) to the UnicodeSet all of the characters for the scripts 675 // used for the specified locale. Part of the implementation of 676 // setAllowedLocales. 677 private void addScriptChars(ULocale locale, UnicodeSet allowedChars) { 678 int scripts[] = UScript.getCode(locale); 679 if (scripts != null) { 680 UnicodeSet tmpSet = new UnicodeSet(); 681 for (int i = 0; i < scripts.length; i++) { 682 tmpSet.applyIntPropertyValue(UProperty.SCRIPT, scripts[i]); 683 allowedChars.addAll(tmpSet); 684 } 685 } 686 // else it's an unknown script. 687 // Maybe they asked for the script of "zxx", which refers to no linguistic content. 688 // Maybe they asked for the script of a newer locale that we don't know in the older version of ICU. 689 } 690 691 /** 692 * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit 693 * is is replaced by the new settings. This includes limits on characters that were set with the 694 * setAllowedLocales() function. Note that the RESTRICTED set is useful. 
695 * 696 * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker by this function. 697 * 698 * @param chars 699 * A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by 700 * this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling 701 * this function. Note that this clears the allowedLocales set. 702 * @return self 703 */ 704 public Builder setAllowedChars(UnicodeSet chars) { 705 fAllowedCharsSet.set(chars); 706 fAllowedLocales.clear(); 707 fChecks |= CHAR_LIMIT; 708 return this; 709 } 710 711 /** 712 * Set the loosest restriction level allowed for strings. The default if this is not called is 713 * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Calling this method enables the {@link #RESTRICTION_LEVEL} and 714 * {@link #MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are 715 * to be performed by {@link SpoofChecker#failsChecks}, see {@link #setChecks}. 716 * 717 * @param restrictionLevel 718 * The loosest restriction level allowed. 719 * @return self 720 * @hide draft / provisional / internal are hidden on Android 721 */ 722 public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) { 723 fRestrictionLevel = restrictionLevel; 724 fChecks |= RESTRICTION_LEVEL | MIXED_NUMBERS; 725 return this; 726 } 727 728 /* 729 * ***************************************************************************** 730 * Internal classes for compililing confusable data into its binary (runtime) form. 731 * ***************************************************************************** 732 */ 733 // --------------------------------------------------------------------- 734 // 735 // buildConfusableData Compile the source confusable data, as defined by 736 // the Unicode data file confusables.txt, into the binary 737 // structures used by the confusable detector. 
738 // 739 // The binary structures are described in uspoof_impl.h 740 // 741 // 1. parse the data, making a hash table mapping from a codepoint to a String. 742 // 743 // 2. Sort all of the strings encountered by length, since they will need to 744 // be stored in that order in the final string table. 745 // TODO: Sorting these strings by length is no longer needed since the removal of 746 // the string lengths table. This logic can be removed to save processing time 747 // when building confusables data. 748 // 749 // 3. Build a list of keys (UChar32s) from the mapping table. Sort the 750 // list because that will be the ordering of our runtime table. 751 // 752 // 4. Generate the run time string table. This is generated before the key & value 753 // table because we need the string indexes when building those tables. 754 // 755 // 5. Build the run-time key and value table. These are parallel tables, and 756 // are built at the same time 757 758 // class ConfusabledataBuilder 759 // An instance of this class exists while the confusable data is being built from source. 760 // It encapsulates the intermediate data structures that are used for building. 761 // It exports one static function, to do a confusable data build. 762 private static class ConfusabledataBuilder { 763 764 private Hashtable<Integer, SPUString> fTable; 765 private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the 766 // four mapping tables. 767 768 // The compiled data is first assembled into the following four collections, 769 // then output to the builder's SpoofData object. 
770 private StringBuffer fStringTable; 771 private ArrayList<Integer> fKeyVec; 772 private ArrayList<Integer> fValueVec; 773 private SPUStringPool stringPool; 774 private Pattern fParseLine; 775 private Pattern fParseHexNum; 776 private int fLineNum; 777 778 ConfusabledataBuilder() { 779 fTable = new Hashtable<Integer, SPUString>(); 780 fKeySet = new UnicodeSet(); 781 fKeyVec = new ArrayList<Integer>(); 782 fValueVec = new ArrayList<Integer>(); 783 stringPool = new SPUStringPool(); 784 } 785 786 void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException { 787 StringBuffer fInput = new StringBuffer(); 788 789 // Convert the user input data from UTF-8 to char (UTF-16) 790 LineNumberReader lnr = new LineNumberReader(confusables); 791 do { 792 String line = lnr.readLine(); 793 if (line == null) { 794 break; 795 } 796 fInput.append(line); 797 fInput.append('\n'); 798 } while (true); 799 800 // Regular Expression to parse a line from Confusables.txt. The expression will match 801 // any line. What was matched is determined by examining which capture groups have a match. 802 // Capture Group 1: the source char 803 // Capture Group 2: the replacement chars 804 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated) 805 // Capture Group 7: A blank or comment only line. 806 // Capture Group 8: A syntactically invalid line. Anything that didn't match before. 807 // Example Line from the confusables.txt source file: 808 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... 
" 809 fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char 810 "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s) 811 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued) 812 "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type 813 "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment 814 "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment 815 "|^(.*?)$"); // OR match any line, which catches illegal lines. 816 817 // Regular expression for parsing a hex number out of a space-separated list of them. 818 // Capture group 1 gets the number, with spaces removed. 819 fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)"); 820 821 // Zap any Byte Order Mark at the start of input. Changing it to a space 822 // is benign given the syntax of the input. 823 if (fInput.charAt(0) == 0xfeff) { 824 fInput.setCharAt(0, (char) 0x20); 825 } 826 827 // Parse the input, one line per iteration of this loop. 828 Matcher matcher = fParseLine.matcher(fInput); 829 while (matcher.find()) { 830 fLineNum++; 831 if (matcher.start(7) >= 0) { 832 // this was a blank or comment line. 833 continue; 834 } 835 if (matcher.start(8) >= 0) { 836 // input file syntax error. 837 // status = U_PARSE_ERROR; 838 throw new ParseException( 839 "Confusables, line " + fLineNum + ": Unrecognized Line: " + matcher.group(8), 840 matcher.start(8)); 841 } 842 843 // We have a good input line. Extract the key character and mapping 844 // string, and 845 // put them into the appropriate mapping table. 
846 int keyChar = Integer.parseInt(matcher.group(1), 16); 847 if (keyChar > 0x10ffff) { 848 throw new ParseException( 849 "Confusables, line " + fLineNum + ": Bad code point: " + matcher.group(1), 850 matcher.start(1)); 851 } 852 Matcher m = fParseHexNum.matcher(matcher.group(2)); 853 854 StringBuilder mapString = new StringBuilder(); 855 while (m.find()) { 856 int c = Integer.parseInt(m.group(1), 16); 857 if (c > 0x10ffff) { 858 throw new ParseException( 859 "Confusables, line " + fLineNum + ": Bad code point: " + Integer.toString(c, 16), 860 matcher.start(2)); 861 } 862 mapString.appendCodePoint(c); 863 } 864 assert (mapString.length() >= 1); 865 866 // Put the map (value) string into the string pool 867 // This a little like a Java intern() - any duplicates will be 868 // eliminated. 869 SPUString smapString = stringPool.addString(mapString.toString()); 870 871 // Add the char . string mapping to the table. 872 // For Unicode 8, the SL, SA and ML tables have been discontinued. 873 // All input data from confusables.txt is tagged MA. 874 fTable.put(keyChar, smapString); 875 876 fKeySet.add(keyChar); 877 } 878 879 // Input data is now all parsed and collected. 880 // Now create the run-time binary form of the data. 881 // 882 // This is done in two steps. First the data is assembled into vectors and strings, 883 // for ease of construction, then the contents of these collections are copied 884 // into the actual SpoofData object. 885 886 // Build up the string array, and record the index of each string therein 887 // in the (build time only) string pool. 888 // Strings of length one are not entered into the strings array. 
889 // (Strings in the table are sorted by length) 890 891 stringPool.sort(); 892 fStringTable = new StringBuffer(); 893 int poolSize = stringPool.size(); 894 int i; 895 for (i = 0; i < poolSize; i++) { 896 SPUString s = stringPool.getByIndex(i); 897 int strLen = s.fStr.length(); 898 int strIndex = fStringTable.length(); 899 if (strLen == 1) { 900 // strings of length one do not get an entry in the string table. 901 // Keep the single string character itself here, which is the same 902 // convention that is used in the final run-time string table index. 903 s.fCharOrStrTableIndex = s.fStr.charAt(0); 904 } else { 905 s.fCharOrStrTableIndex = strIndex; 906 fStringTable.append(s.fStr); 907 } 908 } 909 910 // Construct the compile-time Key and Value table. 911 // 912 // The keys in the Key table follow the format described in uspoof.h for the 913 // Cfu confusables data structure. 914 // 915 // Starting in ICU 58, each code point has exactly one entry in the data 916 // structure. 917 918 for (String keyCharStr : fKeySet) { 919 int keyChar = keyCharStr.codePointAt(0); 920 SPUString targetMapping = fTable.get(keyChar); 921 assert targetMapping != null; 922 923 // Throw a sane exception if trying to consume a long string. Otherwise, 924 // codePointAndLengthToKey will throw an assertion error. 925 if (targetMapping.fStr.length() > 256) { 926 throw new IllegalArgumentException("Confusable prototypes cannot be longer than 256 entries."); 927 } 928 929 int key = ConfusableDataUtils.codePointAndLengthToKey(keyChar, targetMapping.fStr.length()); 930 int value = targetMapping.fCharOrStrTableIndex; 931 932 fKeyVec.add(key); 933 fValueVec.add(value); 934 } 935 936 // Put the assembled data into the destination SpoofData object. 937 938 // The Key Table 939 // While copying the keys to the output array, 940 // also sanity check that the keys are sorted. 
941 int numKeys = fKeyVec.size(); 942 dest.fCFUKeys = new int[numKeys]; 943 int previousCodePoint = 0; 944 for (i = 0; i < numKeys; i++) { 945 int key = fKeyVec.get(i); 946 int codePoint = ConfusableDataUtils.keyToCodePoint(key); 947 // strictly greater because there can be only one entry per code point 948 assert codePoint > previousCodePoint; 949 dest.fCFUKeys[i] = key; 950 previousCodePoint = codePoint; 951 } 952 953 // The Value Table, parallels the key table 954 int numValues = fValueVec.size(); 955 assert (numKeys == numValues); 956 dest.fCFUValues = new short[numValues]; 957 i = 0; 958 for (int value : fValueVec) { 959 assert (value < 0xffff); 960 dest.fCFUValues[i++] = (short) value; 961 } 962 963 // The Strings Table. 964 dest.fCFUStrings = fStringTable.toString(); 965 } 966 967 public static void buildConfusableData(Reader confusables, SpoofData dest) 968 throws java.io.IOException, ParseException { 969 ConfusabledataBuilder builder = new ConfusabledataBuilder(); 970 builder.build(confusables, dest); 971 } 972 973 /* 974 * ***************************************************************************** 975 * Internal classes for compiling confusable data into its binary (runtime) form. 976 * ***************************************************************************** 977 */ 978 // SPUString 979 // Holds a string that is the result of one of the mappings defined 980 // by the confusable mapping data (confusables.txt from Unicode.org) 981 // Instances of SPUString exist during the compilation process only. 982 983 private static class SPUString { 984 String fStr; // The actual string. 985 int fCharOrStrTableIndex; // Index into the final runtime data for this string. 986 // (or, for length 1, the single string char itself, 987 // there being no string table entry for it.) 988 989 SPUString(String s) { 990 fStr = s; 991 fCharOrStrTableIndex = 0; 992 } 993 } 994 995 // Comparison function for ordering strings in the string pool. 
996 // Compare by length first, then, within a group of the same length, 997 // by code point order. 998 999 private static class SPUStringComparator implements Comparator<SPUString> { 1000 @Override 1001 public int compare(SPUString sL, SPUString sR) { 1002 int lenL = sL.fStr.length(); 1003 int lenR = sR.fStr.length(); 1004 if (lenL < lenR) { 1005 return -1; 1006 } else if (lenL > lenR) { 1007 return 1; 1008 } else { 1009 return sL.fStr.compareTo(sR.fStr); 1010 } 1011 } 1012 1013 final static SPUStringComparator INSTANCE = new SPUStringComparator(); 1014 } 1015 1016 // String Pool A utility class for holding the strings that are the result of 1017 // the spoof mappings. These strings will utimately end up in the 1018 // run-time String Table. 1019 // This is sort of like a sorted set of strings, except that ICU's anemic 1020 // built-in collections don't support those, so it is implemented with a 1021 // combination of a uhash and a Vector. 1022 private static class SPUStringPool { 1023 public SPUStringPool() { 1024 fVec = new Vector<SPUString>(); 1025 fHash = new Hashtable<String, SPUString>(); 1026 } 1027 1028 public int size() { 1029 return fVec.size(); 1030 } 1031 1032 // Get the n-th string in the collection. 1033 public SPUString getByIndex(int index) { 1034 SPUString retString = fVec.elementAt(index); 1035 return retString; 1036 } 1037 1038 // Add a string. Return the string from the table. 1039 // If the input parameter string is already in the table, delete the 1040 // input parameter and return the existing string. 1041 public SPUString addString(String src) { 1042 SPUString hashedString = fHash.get(src); 1043 if (hashedString == null) { 1044 hashedString = new SPUString(src); 1045 fHash.put(src, hashedString); 1046 fVec.addElement(hashedString); 1047 } 1048 return hashedString; 1049 } 1050 1051 // Sort the contents; affects the ordering of getByIndex(). 
1052 public void sort() { 1053 Collections.sort(fVec, SPUStringComparator.INSTANCE); 1054 } 1055 1056 private Vector<SPUString> fVec; // Elements are SPUString * 1057 private Hashtable<String, SPUString> fHash; // Key: Value: 1058 } 1059 1060 } 1061 } 1062 1063 /** 1064 * Get the Restriction Level that is being tested. 1065 * 1066 * @return The restriction level 1067 * @deprecated This API is ICU internal only. 1068 * @hide draft / provisional / internal are hidden on Android 1069 */ 1070 @Deprecated 1071 public RestrictionLevel getRestrictionLevel() { 1072 return fRestrictionLevel; 1073 } 1074 1075 /** 1076 * Get the set of checks that this Spoof Checker has been configured to perform. 1077 * 1078 * @return The set of checks that this spoof checker will perform. 1079 */ 1080 public int getChecks() { 1081 return fChecks; 1082 } 1083 1084 /** 1085 * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on 1086 * scripts have been specified, an empty set will be returned. 1087 * 1088 * setAllowedChars() will reset the list of allowed locales to be empty. 1089 * 1090 * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales(); 1091 * the information other than languages from the originally specified locales may be omitted. 1092 * 1093 * @return A set of locales corresponding to the acceptable scripts. 1094 */ 1095 public Set<ULocale> getAllowedLocales() { 1096 return Collections.unmodifiableSet(fAllowedLocales); 1097 } 1098 1099 /** 1100 * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If 1101 * no limitations on scripts have been specified, an empty set will be returned. 1102 * 1103 * @return A set of locales corresponding to the acceptable scripts. 
1104 */ 1105 public Set<Locale> getAllowedJavaLocales() { 1106 HashSet<Locale> locales = new HashSet<Locale>(fAllowedLocales.size()); 1107 for (ULocale uloc : fAllowedLocales) { 1108 locales.add(uloc.toLocale()); 1109 } 1110 return locales; 1111 } 1112 1113 /** 1114 * Get a UnicodeSet for the characters permitted in an identifier. This corresponds to the limits imposed by the Set 1115 * Allowed Characters functions. Limitations imposed by other checks will not be reflected in the set returned by 1116 * this function. 1117 * 1118 * The returned set will be frozen, meaning that it cannot be modified by the caller. 1119 * 1120 * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test. 1121 */ 1122 public UnicodeSet getAllowedChars() { 1123 return fAllowedCharsSet; 1124 } 1125 1126 /** 1127 * A struct-like class to hold the results of a Spoof Check operation. Tells which check(s) have failed. 1128 */ 1129 public static class CheckResult { 1130 /** 1131 * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests 1132 * in question: RESTRICTION_LEVEL, CHAR_LIMIT, and so on. 1133 * 1134 * @see Builder#setChecks 1135 */ 1136 public int checks; 1137 1138 /** 1139 * The index of the first string position that failed a check. 1140 * 1141 * @deprecated ICU 51. No longer supported. Always set to zero. 1142 */ 1143 @Deprecated 1144 public int position; 1145 1146 /** 1147 * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null. The set will contain the zero 1148 * digit from each decimal number system found in the input string. 1149 * 1150 * @hide draft / provisional / internal are hidden on Android 1151 */ 1152 public UnicodeSet numerics; 1153 1154 /** 1155 * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null. 
1156 * 1157 * @hide draft / provisional / internal are hidden on Android 1158 */ 1159 public RestrictionLevel restrictionLevel; 1160 1161 /** 1162 * Default constructor 1163 */ 1164 public CheckResult() { 1165 checks = 0; 1166 position = 0; 1167 } 1168 1169 /** 1170 * {@inheritDoc} 1171 */ 1172 @Override 1173 public String toString() { 1174 StringBuilder sb = new StringBuilder(); 1175 sb.append("checks:"); 1176 if (checks == 0) { 1177 sb.append(" none"); 1178 } else if (checks == ALL_CHECKS) { 1179 sb.append(" all"); 1180 } else { 1181 if ((checks & SINGLE_SCRIPT_CONFUSABLE) != 0) { 1182 sb.append(" SINGLE_SCRIPT_CONFUSABLE"); 1183 } 1184 if ((checks & MIXED_SCRIPT_CONFUSABLE) != 0) { 1185 sb.append(" MIXED_SCRIPT_CONFUSABLE"); 1186 } 1187 if ((checks & WHOLE_SCRIPT_CONFUSABLE) != 0) { 1188 sb.append(" WHOLE_SCRIPT_CONFUSABLE"); 1189 } 1190 if ((checks & ANY_CASE) != 0) { 1191 sb.append(" ANY_CASE"); 1192 } 1193 if ((checks & RESTRICTION_LEVEL) != 0) { 1194 sb.append(" RESTRICTION_LEVEL"); 1195 } 1196 if ((checks & INVISIBLE) != 0) { 1197 sb.append(" INVISIBLE"); 1198 } 1199 if ((checks & CHAR_LIMIT) != 0) { 1200 sb.append(" CHAR_LIMIT"); 1201 } 1202 if ((checks & MIXED_NUMBERS) != 0) { 1203 sb.append(" MIXED_NUMBERS"); 1204 } 1205 } 1206 sb.append(", numerics: ").append(numerics.toPattern(false)); 1207 sb.append(", position: ").append(position); 1208 sb.append(", restrictionLevel: ").append(restrictionLevel); 1209 return sb.toString(); 1210 } 1211 } 1212 1213 /** 1214 * Check the specified string for possible security issues. The text to be checked will typically be an identifier 1215 * of some sort. The set of checks to be performed was specified when building the SpoofChecker. 1216 * 1217 * @param text 1218 * A String to be checked for possible security issues. 1219 * @param checkResult 1220 * Output parameter, indicates which specific tests failed. May be null if the information is not wanted. 1221 * @return True there any issue is found with the input string. 
1222 */ 1223 public boolean failsChecks(String text, CheckResult checkResult) { 1224 int length = text.length(); 1225 1226 int result = 0; 1227 if (checkResult != null) { 1228 checkResult.position = 0; 1229 checkResult.numerics = null; 1230 checkResult.restrictionLevel = null; 1231 } 1232 1233 if (0 != (this.fChecks & RESTRICTION_LEVEL)) { 1234 RestrictionLevel textRestrictionLevel = getRestrictionLevel(text); 1235 if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) { 1236 result |= RESTRICTION_LEVEL; 1237 } 1238 if (checkResult != null) { 1239 checkResult.restrictionLevel = textRestrictionLevel; 1240 } 1241 } 1242 1243 if (0 != (this.fChecks & MIXED_NUMBERS)) { 1244 UnicodeSet numerics = new UnicodeSet(); 1245 getNumerics(text, numerics); 1246 if (numerics.size() > 1) { 1247 result |= MIXED_NUMBERS; 1248 } 1249 if (checkResult != null) { 1250 checkResult.numerics = numerics; 1251 } 1252 } 1253 1254 if (0 != (this.fChecks & CHAR_LIMIT)) { 1255 int i; 1256 int c; 1257 for (i = 0; i < length;) { 1258 // U16_NEXT(text, i, length, c); 1259 c = Character.codePointAt(text, i); 1260 i = Character.offsetByCodePoints(text, i, 1); 1261 if (!this.fAllowedCharsSet.contains(c)) { 1262 result |= CHAR_LIMIT; 1263 break; 1264 } 1265 } 1266 } 1267 1268 if (0 != (this.fChecks & INVISIBLE)) { 1269 // This check needs to be done on NFD input 1270 String nfdText = nfdNormalizer.normalize(text); 1271 1272 // scan for more than one occurrence of the same non-spacing mark 1273 // in a sequence of non-spacing marks. 1274 int i; 1275 int c; 1276 int firstNonspacingMark = 0; 1277 boolean haveMultipleMarks = false; 1278 UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a 1279 // single combining sequence. 
1280 for (i = 0; i < length;) { 1281 c = Character.codePointAt(nfdText, i); 1282 i = Character.offsetByCodePoints(nfdText, i, 1); 1283 if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) { 1284 firstNonspacingMark = 0; 1285 if (haveMultipleMarks) { 1286 marksSeenSoFar.clear(); 1287 haveMultipleMarks = false; 1288 } 1289 continue; 1290 } 1291 if (firstNonspacingMark == 0) { 1292 firstNonspacingMark = c; 1293 continue; 1294 } 1295 if (!haveMultipleMarks) { 1296 marksSeenSoFar.add(firstNonspacingMark); 1297 haveMultipleMarks = true; 1298 } 1299 if (marksSeenSoFar.contains(c)) { 1300 // report the error, and stop scanning. 1301 // No need to find more than the first failure. 1302 result |= INVISIBLE; 1303 break; 1304 } 1305 marksSeenSoFar.add(c); 1306 } 1307 } 1308 if (checkResult != null) { 1309 checkResult.checks = result; 1310 } 1311 return (0 != result); 1312 } 1313 1314 /** 1315 * Check the specified string for possible security issues. The text to be checked will typically be an identifier 1316 * of some sort. The set of checks to be performed was specified when building the SpoofChecker. 1317 * 1318 * @param text 1319 * A String to be checked for possible security issues. 1320 * @return True there any issue is found with the input string. 1321 */ 1322 public boolean failsChecks(String text) { 1323 return failsChecks(text, null); 1324 } 1325 1326 /** 1327 * Check the whether two specified strings are visually confusable. The types of confusability to be tested - single 1328 * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker. 1329 * 1330 * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE 1331 * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected. 1332 * 1333 * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. 
If identifiers are case 1334 * folded for comparison and display to the user, do not select the ANY_CASE option. 1335 * 1336 * 1337 * @param s1 1338 * The first of the two strings to be compared for confusability. 1339 * @param s2 1340 * The second of the two strings to be compared for confusability. 1341 * @return Non-zero if s1 and s1 are confusable. If not 0, the value will indicate the type(s) of confusability 1342 * found, as defined by spoof check test constants. 1343 */ 1344 public int areConfusable(String s1, String s2) { 1345 // 1346 // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable, 1347 // and for definitions of the types (single, whole, mixed-script) of confusables. 1348 1349 // We only care about a few of the check flags. Ignore the others. 1350 // If no tests relevant to this function have been specified, signal an error. 1351 // TODO: is this really the right thing to do? It's probably an error on 1352 // the caller's part, but logically we would just return 0 (no error). 1353 if ((this.fChecks & CONFUSABLE) == 0) { 1354 throw new IllegalArgumentException("No confusable checks are enabled."); 1355 } 1356 1357 // Compute the skeletons and check for confusability. 1358 String s1Skeleton = getSkeleton(s1); 1359 String s2Skeleton = getSkeleton(s2); 1360 if (!s1Skeleton.equals(s2Skeleton)) { 1361 return 0; 1362 } 1363 1364 // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes 1365 // of confusables according to UTS 39 section 4. 1366 // Start by computing the resolved script sets of s1 and s2. 
1367 ScriptSet s1RSS = new ScriptSet(); 1368 getResolvedScriptSet(s1, s1RSS); 1369 ScriptSet s2RSS = new ScriptSet(); 1370 getResolvedScriptSet(s2, s2RSS); 1371 1372 // Turn on all applicable flags 1373 int result = 0; 1374 if (s1RSS.intersects(s2RSS)) { 1375 result |= SINGLE_SCRIPT_CONFUSABLE; 1376 } else { 1377 result |= MIXED_SCRIPT_CONFUSABLE; 1378 if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) { 1379 result |= WHOLE_SCRIPT_CONFUSABLE; 1380 } 1381 } 1382 1383 // Turn off flags that the user doesn't want 1384 result &= fChecks; 1385 1386 return result; 1387 } 1388 1389 /** 1390 * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are 1391 * confusable if their skeletons are identical. See Unicode UAX 39 for additional information. 1392 * 1393 * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some 1394 * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons. 1395 * 1396 * Skeletons are computed using the algorithm and data described in Unicode UAX 39. 1397 * 1398 * @param str 1399 * The input string whose skeleton will be generated. 1400 * @return The output skeleton string. 1401 * 1402 * @hide draft / provisional / internal are hidden on Android 1403 */ 1404 public String getSkeleton(CharSequence str) { 1405 // Apply the skeleton mapping to the NFD normalized input string 1406 // Accumulate the skeleton, possibly unnormalized, in a String. 
1407 String nfdId = nfdNormalizer.normalize(str); 1408 int normalizedLen = nfdId.length(); 1409 StringBuilder skelSB = new StringBuilder(); 1410 for (int inputIndex = 0; inputIndex < normalizedLen;) { 1411 int c = Character.codePointAt(nfdId, inputIndex); 1412 inputIndex += Character.charCount(c); 1413 this.fSpoofData.confusableLookup(c, skelSB); 1414 } 1415 String skelStr = skelSB.toString(); 1416 skelStr = nfdNormalizer.normalize(skelStr); 1417 return skelStr; 1418 } 1419 1420 /** 1421 * Calls {@link SpoofChecker#getSkeleton(CharSequence id)}. Starting with ICU 55, the "type" parameter has been 1422 * ignored, and starting with ICU 58, this function has been deprecated. 1423 * 1424 * @param type 1425 * No longer supported. Prior to ICU 55, was used to specify the mapping table SL, SA, ML, or MA. 1426 * @param id 1427 * The input identifier whose skeleton will be generated. 1428 * @return The output skeleton string. 1429 * 1430 * @deprecated ICU 58 1431 */ 1432 @Deprecated 1433 public String getSkeleton(int type, String id) { 1434 return getSkeleton(id); 1435 } 1436 1437 /** 1438 * Equality function. Return true if the two SpoofChecker objects incorporate the same confusable data and have 1439 * enabled the same set of checks. 1440 * 1441 * @param other 1442 * the SpoofChecker being compared with. 1443 * @return true if the two SpoofCheckers are equal. 
1444 * @hide draft / provisional / internal are hidden on Android 1445 */ 1446 @Override 1447 public boolean equals(Object other) { 1448 if (!(other instanceof SpoofChecker)) { 1449 return false; 1450 } 1451 SpoofChecker otherSC = (SpoofChecker) other; 1452 if (fSpoofData != otherSC.fSpoofData && fSpoofData != null && !fSpoofData.equals(otherSC.fSpoofData)) { 1453 return false; 1454 } 1455 if (fChecks != otherSC.fChecks) { 1456 return false; 1457 } 1458 if (fAllowedLocales != otherSC.fAllowedLocales && fAllowedLocales != null 1459 && !fAllowedLocales.equals(otherSC.fAllowedLocales)) { 1460 return false; 1461 } 1462 if (fAllowedCharsSet != otherSC.fAllowedCharsSet && fAllowedCharsSet != null 1463 && !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) { 1464 return false; 1465 } 1466 if (fRestrictionLevel != otherSC.fRestrictionLevel) { 1467 return false; 1468 } 1469 return true; 1470 } 1471 1472 /** 1473 * @hide draft / provisional / internal are hidden on Android 1474 */ 1475 @Override 1476 public int hashCode() { 1477 return fChecks 1478 ^ fSpoofData.hashCode() 1479 ^ fAllowedLocales.hashCode() 1480 ^ fAllowedCharsSet.hashCode() 1481 ^ fRestrictionLevel.ordinal(); 1482 } 1483 1484 /** 1485 * Computes the augmented script set for a code point, according to UTS 39 section 5.1. 
1486 */ 1487 private static void getAugmentedScriptSet(int codePoint, ScriptSet result) { 1488 result.clear(); 1489 UScript.getScriptExtensions(codePoint, result); 1490 1491 // Section 5.1 step 1 1492 if (result.get(UScript.HAN)) { 1493 result.set(UScript.HAN_WITH_BOPOMOFO); 1494 result.set(UScript.JAPANESE); 1495 result.set(UScript.KOREAN); 1496 } 1497 if (result.get(UScript.HIRAGANA)) { 1498 result.set(UScript.JAPANESE); 1499 } 1500 if (result.get(UScript.KATAKANA)) { 1501 result.set(UScript.JAPANESE); 1502 } 1503 if (result.get(UScript.HANGUL)) { 1504 result.set(UScript.KOREAN); 1505 } 1506 if (result.get(UScript.BOPOMOFO)) { 1507 result.set(UScript.HAN_WITH_BOPOMOFO); 1508 } 1509 1510 // Section 5.1 step 2 1511 if (result.get(UScript.COMMON) || result.get(UScript.INHERITED)) { 1512 result.setAll(); 1513 } 1514 } 1515 1516 /** 1517 * Computes the resolved script set for a string, according to UTS 39 section 5.1. 1518 */ 1519 private void getResolvedScriptSet(CharSequence input, ScriptSet result) { 1520 getResolvedScriptSetWithout(input, UScript.CODE_LIMIT, result); 1521 } 1522 1523 /** 1524 * Computes the resolved script set for a string, omitting characters having the specified script. If 1525 * UScript.CODE_LIMIT is passed as the second argument, all characters are included. 
1526 */ 1527 private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) { 1528 result.setAll(); 1529 1530 ScriptSet temp = new ScriptSet(); 1531 for (int utf16Offset = 0; utf16Offset < input.length();) { 1532 int codePoint = Character.codePointAt(input, utf16Offset); 1533 utf16Offset += Character.charCount(codePoint); 1534 1535 // Compute the augmented script set for the character 1536 getAugmentedScriptSet(codePoint, temp); 1537 1538 // Intersect the augmented script set with the resolved script set, but only if the character doesn't 1539 // have the script specified in the function call 1540 if (script == UScript.CODE_LIMIT || !temp.get(script)) { 1541 result.and(temp); 1542 } 1543 } 1544 } 1545 1546 /** 1547 * Computes the set of numerics for a string, according to UTS 39 section 5.3. 1548 */ 1549 private void getNumerics(String input, UnicodeSet result) { 1550 result.clear(); 1551 1552 for (int utf16Offset = 0; utf16Offset < input.length();) { 1553 int codePoint = Character.codePointAt(input, utf16Offset); 1554 utf16Offset += Character.charCount(codePoint); 1555 1556 // Store a representative character for each kind of decimal digit 1557 if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) { 1558 // Store the zero character as a representative for comparison. 1559 // Unicode guarantees it is codePoint - value 1560 result.add(codePoint - UCharacter.getNumericValue(codePoint)); 1561 } 1562 } 1563 } 1564 1565 /** 1566 * Computes the restriction level of a string, according to UTS 39 section 5.2. 
1567 */ 1568 private RestrictionLevel getRestrictionLevel(String input) { 1569 // Section 5.2 step 1: 1570 if (!fAllowedCharsSet.containsAll(input)) { 1571 return RestrictionLevel.UNRESTRICTIVE; 1572 } 1573 1574 // Section 5.2 step 2: 1575 if (ASCII.containsAll(input)) { 1576 return RestrictionLevel.ASCII; 1577 } 1578 1579 // Section 5.2 steps 3: 1580 ScriptSet resolvedScriptSet = new ScriptSet(); 1581 getResolvedScriptSet(input, resolvedScriptSet); 1582 1583 // Section 5.2 step 4: 1584 if (!resolvedScriptSet.isEmpty()) { 1585 return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE; 1586 } 1587 1588 // Section 5.2 step 5: 1589 ScriptSet resolvedNoLatn = new ScriptSet(); 1590 getResolvedScriptSetWithout(input, UScript.LATIN, resolvedNoLatn); 1591 1592 // Section 5.2 step 6: 1593 if (resolvedNoLatn.get(UScript.HAN_WITH_BOPOMOFO) || resolvedNoLatn.get(UScript.JAPANESE) 1594 || resolvedNoLatn.get(UScript.KOREAN)) { 1595 return RestrictionLevel.HIGHLY_RESTRICTIVE; 1596 } 1597 1598 // Section 5.2 step 7: 1599 if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) && !resolvedNoLatn.get(UScript.GREEK) 1600 && !resolvedNoLatn.get(UScript.CHEROKEE)) { 1601 return RestrictionLevel.MODERATELY_RESTRICTIVE; 1602 } 1603 1604 // Section 5.2 step 8: 1605 return RestrictionLevel.MINIMALLY_RESTRICTIVE; 1606 } 1607 1608 // Data Members 1609 private int fChecks; // Bit vector of checks to perform. 1610 private SpoofData fSpoofData; 1611 private Set<ULocale> fAllowedLocales; // The Set of allowed locales. 1612 private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters. 1613 private RestrictionLevel fRestrictionLevel; 1614 1615 private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance(); 1616 1617 // Confusable Mappings Data Structures, version 2.0 1618 // 1619 // This description and the corresponding implementation are to be kept 1620 // in-sync with the copy in icu4c uspoof_impl.h. 
//
    // For the confusable data, we are essentially implementing a map,
    //     key:    a code point
    //     value:  a string. Most commonly one char in length, but can be more.
    //
    // The keys are stored as a sorted array of 32 bit ints.
    //     bits 0-23    a code point value
    //     bits 24-31   length of value string, in UChars (between 1 and 256 UChars),
    //                  stored as length - 1.
    // The key table is sorted in ascending code point order. (Not on the
    // 32 bit int value; the length bits do not participate in the sorting.)
    //
    // Lookup is done by means of a binary search in the key table.
    //
    // The corresponding values are kept in a parallel array of 16 bit ints.
    // If the value string is of length 1, it is literally in the value array.
    // For longer strings, the value array contains an index into the strings
    // table.
    //
    // String Table:
    //     The strings table contains all of the value strings (those of length two or greater)
    //     concatenated together into one long char (UTF-16) array.
    //
    //     There is no nul character or other mark between adjacent strings.
    //
    //----------------------------------------------------------------------------
    //
    // Changes from format version 1 to format version 2:
    //     1) Removal of the whole-script confusable data tables.
    //     2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask.
    //     3) Expansion of string length value in the key bitmask from 2 bits to 8 bits.
    //     4) Removal of the string lengths table since 8 bits is sufficient for the
    //        lengths of all entries in confusables.txt.
1653 // 1654 private static final class ConfusableDataUtils { 1655 public static final int FORMAT_VERSION = 2; // version for ICU 58 1656 1657 public static final int keyToCodePoint(int key) { 1658 return key & 0x00ffffff; 1659 } 1660 1661 public static final int keyToLength(int key) { 1662 return ((key & 0xff000000) >> 24) + 1; 1663 } 1664 1665 public static final int codePointAndLengthToKey(int codePoint, int length) { 1666 assert (codePoint & 0x00ffffff) == codePoint; 1667 assert length <= 256; 1668 return codePoint | ((length - 1) << 24); 1669 } 1670 } 1671 1672 // ------------------------------------------------------------------------------------- 1673 // 1674 // SpoofData 1675 // 1676 // This class corresponds to the ICU SpoofCheck data. 1677 // 1678 // The data can originate with the Binary ICU data that is generated in ICU4C, 1679 // or it can originate from source rules that are compiled in ICU4J. 1680 // 1681 // This class does not include the set of checks to be performed, but only 1682 // data that is serialized into the ICU binary data. 1683 // 1684 // Because Java cannot easily wrap binary data like ICU4C, the binary data is 1685 // copied into Java structures that are convenient for use by the run time code. 1686 // 1687 // --------------------------------------------------------------------------------------- 1688 private static class SpoofData { 1689 1690 // The Confusable data, Java data structures for. 
int[] fCFUKeys;          // Packed keys (code point + value length), ascending by code point.
        short[] fCFUValues;      // Parallel to fCFUKeys: a literal char, or an index into fCFUStrings.
        String fCFUStrings;      // All multi-char value strings, concatenated.

        private static final int DATA_FORMAT = 0x43667520; // "Cfu "

        /**
         * Decides whether the format version found in the binary data header can be read by
         * this implementation.
         */
        private static final class IsAcceptable implements Authenticate {
            @Override
            public boolean isDataVersionAcceptable(byte version[]) {
                // Only the major format version decides. The previous expression OR-ed this
                // test with "minor byte != 0" checks, so data with a different major version
                // was accepted whenever any of its minor version bytes was non-zero.
                return version[0] == ConfusableDataUtils.FORMAT_VERSION;
            }
        }

        private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();

        /**
         * Holder for the default data. Exactly one of INSTANCE and EXCEPTION is non-null once
         * the static initializer has run.
         */
        private static final class DefaultData {
            private static SpoofData INSTANCE = null;
            private static IOException EXCEPTION = null;

            static {
                // Note: Although this is static, the Java runtime can delay execution of this block until
                // the data is actually requested via SpoofData.getDefault().
                try {
                    INSTANCE = new SpoofData(ICUBinary.getRequiredData("confusables.cfu"));
                } catch (IOException e) {
                    // Remembered here and reported by getDefault() on every call.
                    EXCEPTION = e;
                }
            }
        }

        /**
         * @return instance for Unicode standard data
         * @throws MissingResourceException
         *             if the default confusables data could not be loaded; the underlying
         *             IOException is attached as the cause.
         */
        public static SpoofData getDefault() {
            if (DefaultData.EXCEPTION != null) {
                MissingResourceException failure = new MissingResourceException(
                        "Could not load default confusables data: " + DefaultData.EXCEPTION.getMessage(),
                        "SpoofChecker", "");
                // MissingResourceException has no public constructor taking a cause.
                failure.initCause(DefaultData.EXCEPTION);
                throw failure;
            }
            return DefaultData.INSTANCE;
        }

        // SpoofChecker Data constructor for use from data builder.
        // Initializes a new, empty data area that will be populated later.
        private SpoofData() {
        }

        // Constructor for use when creating from prebuilt default data.
        // A ByteBuffer is what the ICU internal data loading functions provide.
1741 private SpoofData(ByteBuffer bytes) throws java.io.IOException { 1742 ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE); 1743 bytes.mark(); 1744 readData(bytes); 1745 } 1746 1747 @Override 1748 public boolean equals(Object other) { 1749 if (!(other instanceof SpoofData)) { 1750 return false; 1751 } 1752 SpoofData otherData = (SpoofData) other; 1753 if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys)) 1754 return false; 1755 if (!Arrays.equals(fCFUValues, otherData.fCFUValues)) 1756 return false; 1757 if (!Utility.sameObjects(fCFUStrings, otherData.fCFUStrings) && fCFUStrings != null 1758 && !fCFUStrings.equals(otherData.fCFUStrings)) 1759 return false; 1760 return true; 1761 } 1762 1763 @Override 1764 public int hashCode() { 1765 return Arrays.hashCode(fCFUKeys) 1766 ^ Arrays.hashCode(fCFUValues) 1767 ^ fCFUStrings.hashCode(); 1768 } 1769 1770 // Set the SpoofChecker data from pre-built binary data in a byte buffer. 1771 // The binary data format is as described for ICU4C spoof data. 1772 // 1773 private void readData(ByteBuffer bytes) throws java.io.IOException { 1774 int magic = bytes.getInt(); 1775 if (magic != 0x3845fdef) { 1776 throw new IllegalArgumentException("Bad Spoof Check Data."); 1777 } 1778 @SuppressWarnings("unused") 1779 int dataFormatVersion = bytes.getInt(); 1780 @SuppressWarnings("unused") 1781 int dataLength = bytes.getInt(); 1782 1783 int CFUKeysOffset = bytes.getInt(); 1784 int CFUKeysSize = bytes.getInt(); 1785 1786 int CFUValuesOffset = bytes.getInt(); 1787 int CFUValuesSize = bytes.getInt(); 1788 1789 int CFUStringTableOffset = bytes.getInt(); 1790 int CFUStringTableSize = bytes.getInt(); 1791 1792 // We have now read the file header, and obtained the position for each 1793 // of the data items. Now read each in turn, first seeking the 1794 // input stream to the position of the data item. 
1795 1796 bytes.reset(); 1797 ICUBinary.skipBytes(bytes, CFUKeysOffset); 1798 fCFUKeys = ICUBinary.getInts(bytes, CFUKeysSize, 0); 1799 1800 bytes.reset(); 1801 ICUBinary.skipBytes(bytes, CFUValuesOffset); 1802 fCFUValues = ICUBinary.getShorts(bytes, CFUValuesSize, 0); 1803 1804 bytes.reset(); 1805 ICUBinary.skipBytes(bytes, CFUStringTableOffset); 1806 fCFUStrings = ICUBinary.getString(bytes, CFUStringTableSize, 0); 1807 } 1808 1809 /** 1810 * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be 1811 * appended will between 1 and 18 characters as of Unicode 9. 1812 * 1813 * This is the heart of the confusable skeleton generation implementation. 1814 */ 1815 public void confusableLookup(int inChar, StringBuilder dest) { 1816 // Perform a binary search. 1817 // [lo, hi), i.e lo is inclusive, hi is exclusive. 1818 // The result after the loop will be in lo. 1819 int lo = 0; 1820 int hi = length(); 1821 do { 1822 int mid = (lo + hi) / 2; 1823 if (codePointAt(mid) > inChar) { 1824 hi = mid; 1825 } else if (codePointAt(mid) < inChar) { 1826 lo = mid; 1827 } else { 1828 // Found result. Break early. 1829 lo = mid; 1830 break; 1831 } 1832 } while (hi - lo > 1); 1833 1834 // Did we find an entry? If not, the char maps to itself. 1835 if (codePointAt(lo) != inChar) { 1836 dest.appendCodePoint(inChar); 1837 return; 1838 } 1839 1840 // Add the element to the string builder and return. 1841 appendValueTo(lo, dest); 1842 return; 1843 } 1844 1845 /** 1846 * Return the number of confusable entries in this SpoofData. 1847 * 1848 * @return The number of entries. 1849 */ 1850 public int length() { 1851 return fCFUKeys.length; 1852 } 1853 1854 /** 1855 * Return the code point (key) at the specified index. 1856 * 1857 * @param index 1858 * The index within the SpoofData. 1859 * @return The code point. 
1860 */ 1861 public int codePointAt(int index) { 1862 return ConfusableDataUtils.keyToCodePoint(fCFUKeys[index]); 1863 } 1864 1865 /** 1866 * Append the confusable skeleton at the specified index to the StringBuilder dest. 1867 * 1868 * @param index 1869 * The index within the SpoofData. 1870 * @param dest 1871 * The StringBuilder to which to append the skeleton. 1872 */ 1873 public void appendValueTo(int index, StringBuilder dest) { 1874 int stringLength = ConfusableDataUtils.keyToLength(fCFUKeys[index]); 1875 1876 // Value is either a char (for strings of length 1) or 1877 // an index into the string table (for longer strings) 1878 short value = fCFUValues[index]; 1879 if (stringLength == 1) { 1880 dest.append((char) value); 1881 } else { 1882 dest.append(fCFUStrings, value, value + stringLength); 1883 } 1884 } 1885 } 1886 1887 // ------------------------------------------------------------------------------- 1888 // 1889 // ScriptSet - Script code bit sets. 1890 // Extends Java BitSet with input/output support and a few helper methods. 1891 // Note: The I/O is not currently being used, so it has been commented out. If 1892 // it is needed again, the code can be restored. 1893 // 1894 // ------------------------------------------------------------------------------- 1895 static class ScriptSet extends BitSet { 1896 1897 // Eclipse default value to quell warnings: 1898 private static final long serialVersionUID = 1L; 1899 1900 // // The serialized version of this class can hold INT_CAPACITY * 32 scripts. 
1901 // private static final int INT_CAPACITY = 6; 1902 // private static final long serialVersionUID = INT_CAPACITY; 1903 // static { 1904 // assert ScriptSet.INT_CAPACITY * Integer.SIZE <= UScript.CODE_LIMIT; 1905 // } 1906 // 1907 // public ScriptSet() { 1908 // } 1909 // 1910 // public ScriptSet(ByteBuffer bytes) throws java.io.IOException { 1911 // for (int i = 0; i < INT_CAPACITY; i++) { 1912 // int bits = bytes.getInt(); 1913 // for (int j = 0; j < Integer.SIZE; j++) { 1914 // if ((bits & (1 << j)) != 0) { 1915 // set(i * Integer.SIZE + j); 1916 // } 1917 // } 1918 // } 1919 // } 1920 // 1921 // public void output(DataOutputStream os) throws java.io.IOException { 1922 // for (int i = 0; i < INT_CAPACITY; i++) { 1923 // int bits = 0; 1924 // for (int j = 0; j < Integer.SIZE; j++) { 1925 // if (get(i * Integer.SIZE + j)) { 1926 // bits |= (1 << j); 1927 // } 1928 // } 1929 // os.writeInt(bits); 1930 // } 1931 // } 1932 1933 public void and(int script) { 1934 this.clear(0, script); 1935 this.clear(script + 1, UScript.CODE_LIMIT); 1936 } 1937 1938 public void setAll() { 1939 this.set(0, UScript.CODE_LIMIT); 1940 } 1941 1942 public boolean isFull() { 1943 return cardinality() == UScript.CODE_LIMIT; 1944 } 1945 1946 public void appendStringTo(StringBuilder sb) { 1947 sb.append("{ "); 1948 if (isEmpty()) { 1949 sb.append("- "); 1950 } else if (isFull()) { 1951 sb.append("* "); 1952 } else { 1953 for (int script = 0; script < UScript.CODE_LIMIT; script++) { 1954 if (get(script)) { 1955 sb.append(UScript.getShortName(script)); 1956 sb.append(" "); 1957 } 1958 } 1959 } 1960 sb.append("}"); 1961 } 1962 1963 @Override 1964 public String toString() { 1965 StringBuilder sb = new StringBuilder(); 1966 sb.append("<ScriptSet "); 1967 appendStringTo(sb); 1968 sb.append(">"); 1969 return sb.toString(); 1970 } 1971 } 1972} 1973