1/*
2 *******************************************************************************
3 * Copyright (C) 2009-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7package com.ibm.icu.dev.test.text;
8
9import java.io.BufferedReader;
10import java.io.IOException;
11import java.io.Reader;
12import java.io.StringReader;
13import java.text.ParseException;
14import java.util.Arrays;
15import java.util.BitSet;
16import java.util.Comparator;
17import java.util.HashSet;
18import java.util.LinkedHashSet;
19import java.util.Locale;
20import java.util.Random;
21import java.util.Set;
22import java.util.regex.Matcher;
23import java.util.regex.Pattern;
24
25import com.ibm.icu.dev.test.TestFmwk;
26import com.ibm.icu.dev.test.TestUtil;
27import com.ibm.icu.dev.test.TestUtil.JavaVendor;
28import com.ibm.icu.impl.Utility;
29import com.ibm.icu.lang.UScript;
30import com.ibm.icu.text.IdentifierInfo;
31import com.ibm.icu.text.Normalizer2;
32import com.ibm.icu.text.SpoofChecker;
33import com.ibm.icu.text.SpoofChecker.CheckResult;
34import com.ibm.icu.text.SpoofChecker.RestrictionLevel;
35import com.ibm.icu.text.UnicodeSet;
36import com.ibm.icu.util.ULocale;
37
38public class SpoofCheckerTest extends TestFmwk {
39
40    public static void main(String[] args) throws Exception {
41        new SpoofCheckerTest().run(args);
42    }
43
44    /*
45     * Identifiers for verifying that spoof checking is minimally alive and working.
46     */
47    char[] goodLatinChars = { (char) 0x75, (char) 0x7a };
48    String goodLatin = new String(goodLatinChars); /* "uz", all ASCII */
49    /* (not confusable) */
50    char[] scMixedChars = { (char) 0x73, (char) 0x0441 };
51    String scMixed = new String(scMixedChars); /* "sc", with Cyrillic 'c' */
52    /* (mixed script, confusable */
53
54    String scLatin = "sc";   /* "sc", plain ascii. */
55    String goodCyrl = "\u0438\u043B";    // "Cyrillic small letter i and el"  Plain lower case Cyrillic letters, no latin confusables
56    String goodGreek = "\u03c0\u03c6";   // "Greek small letter pi and phi"  Plain lower case Greek letters
57
58    // Various 1 l I look-alikes
59    String lll_Latin_a = "lI1";   // small letter l, cap I, digit 1, all ASCII
60    //  "\uFF29\u217C\u0196"  Full-width I, Small Roman Numeral fifty, Latin Cap Letter IOTA
61    String lll_Latin_b = "\uff29\u217c\u0196";
62    String lll_Cyrl = "\u0406\u04C0\u0031";  // "\u0406\u04C01"
63    /* The skeleton transform for all of the 'lll' lookalikes is ascii lower case letter l. */
64    String lll_Skel = "lll";
65
66    String han_Hiragana = "\u3086\u308A \u77F3\u7530";  // Hiragana, space, Han
67
68
69    /*
70     * Test basic constructor.
71     */
72    public void TestUSpoof() {
73        SpoofChecker sc = new SpoofChecker.Builder().build();
74        if (sc == null) {
75            errln("FAIL: null SpoofChecker");
76        }
77    }
78
79    /*
80     * Test build from source rules.
81     */
82    public void TestOpenFromSourceRules() {
83        if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) {
84            // Note: IBM Java 5 has a bug reading a large UTF-8 text contents
85            logln("Skip this test case because of the IBM Java 5 bug");
86            return;
87        }
88        String fileName;
89        Reader confusables;
90        Reader confusablesWholeScript;
91
92        try {
93            SpoofChecker rsc = null;
94
95            fileName = "unicode/confusables.txt";
96            confusables = TestUtil.getDataReader(fileName, "UTF-8");
97            try {
98                fileName = "unicode/confusablesWholeScript.txt";
99                confusablesWholeScript = TestUtil.getDataReader(fileName, "UTF-8");
100                try {
101                    rsc = new SpoofChecker.Builder().setData(confusables, confusablesWholeScript).build();
102                } finally {
103                    confusablesWholeScript.close();
104                }
105            } finally {
106                confusables.close();
107            }
108
109            if (rsc == null) {
110                errln("FAIL: null SpoofChecker");
111                return;
112            }
113            // Check that newly built-from-rules SpoofChecker is able to function.
114            checkSkeleton(rsc, "TestOpenFromSourceRules");
115
116            SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
117            rsc.failsChecks("Hello", result);
118
119            // The checker we just built from source rules should be equivalent to the
120            //  default checker created from prebuilt rules baked into the ICU data.
121            SpoofChecker defaultChecker = new SpoofChecker.Builder().build();
122            assertTrue("Checker built from rules equals default", defaultChecker.equals(rsc));
123
124            SpoofChecker optionChecker = new SpoofChecker.Builder().
125                                    setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE).build();
126            assertFalse("", optionChecker.equals(rsc));
127
128            // Stub source data to build into a test SpoofChecker
129            String stubWSConfusables =
130                "# Stub Whole Script Confusable data\n" +
131                "0561          ; Armn; Cyrl; L #      (ա)  ARMENIAN SMALL LETTER AYB\n";
132
133            String stubConfusables =
134                "# Stub confusables data\n" +
135                "05AD ; 0596 ;  SL  # ( ֭ → ֖ ) HEBREW ACCENT DEHI → HEBREW ACCENT TIPEHA   #\n";
136
137            // Verify that re-using a builder doesn't alter SpoofCheckers that were
138            //  previously created by that builder. (The builder could modify data
139            //  being used by the existing checker)
140
141            SpoofChecker.Builder builder = new SpoofChecker.Builder();
142            SpoofChecker testChecker1 = builder.build();
143            assertTrue("", testChecker1.equals(defaultChecker));
144
145            builder.setData(new StringReader(stubConfusables), new StringReader(stubWSConfusables));
146            builder.setRestrictionLevel(RestrictionLevel.UNRESTRICTIVE);
147            builder.setChecks(SpoofChecker.SINGLE_SCRIPT_CONFUSABLE);
148            Set<ULocale>allowedLocales = new HashSet<ULocale>();
149            allowedLocales.add(ULocale.JAPANESE);
150            allowedLocales.add(ULocale.FRENCH);
151            builder.setAllowedLocales(allowedLocales);
152            SpoofChecker testChecker2 = builder.build();
153            SpoofChecker testChecker3 = builder.build();
154
155            assertTrue("", testChecker1.equals(defaultChecker));
156            assertFalse("", testChecker2.equals(defaultChecker));
157            assertTrue("", testChecker2.equals(testChecker3));
158
159        } catch (java.io.IOException e) {
160            errln(e.toString());
161        } catch (ParseException e) {
162            errln(e.toString());
163        }
164    }
165
166    /*
167     * Set & Get Check Flags
168     */
169    public void TestGetSetChecks1() {
170        SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build();
171        int t;
172        t = sc.getChecks();
173        assertEquals("", SpoofChecker.ALL_CHECKS, t);
174
175        sc = new SpoofChecker.Builder().setChecks(0).build();
176        t = sc.getChecks();
177        assertEquals("", 0, t);
178
179        int checks = SpoofChecker.WHOLE_SCRIPT_CONFUSABLE | SpoofChecker.MIXED_SCRIPT_CONFUSABLE
180                | SpoofChecker.ANY_CASE;
181        sc = new SpoofChecker.Builder().setChecks(checks).build();
182        t = sc.getChecks();
183        assertEquals("", checks, t);
184    }
185
186    /*
187     * get & setAllowedChars
188     */
189    public void TestGetSetAllowedChars() {
190        SpoofChecker sc = new SpoofChecker.Builder().build();
191        UnicodeSet us;
192        UnicodeSet uset;
193
194        uset = sc.getAllowedChars();
195        assertTrue("", uset.isFrozen());
196        us = new UnicodeSet((int) 0x41, (int) 0x5A); /* [A-Z] */
197        sc = new SpoofChecker.Builder().setAllowedChars(us).build();
198        assertEquals("", us, sc.getAllowedChars());
199    }
200
201    /*
202     * get & set Checks
203     */
204    public void TestGetSetChecks() {
205        SpoofChecker sc = new SpoofChecker.Builder().build();
206        int checks;
207        int checks2;
208        boolean checkResults;
209
210        checks = sc.getChecks();
211        assertEquals("", SpoofChecker.ALL_CHECKS, checks);
212
213        checks &= ~(SpoofChecker.SINGLE_SCRIPT | SpoofChecker.MIXED_SCRIPT_CONFUSABLE);
214        sc = new SpoofChecker.Builder().setChecks(checks).build();
215        checks2 = sc.getChecks();
216        assertEquals("", checks, checks2);
217
218        /*
219         * The checks that were disabled just above are the same ones that the "scMixed" test fails. So with those tests
220         * gone checking that Identifier should now succeed
221         */
222        checkResults = sc.failsChecks(scMixed);
223        assertFalse("", checkResults);
224    }
225
226    /*
227     * AllowedLocales
228     */
229    public void TestAllowedLocales() {
230        SpoofChecker sc = new SpoofChecker.Builder().build();
231        Set<ULocale> allowedLocales = null;
232        Set<Locale> allowedJavaLocales = null;
233        boolean checkResults;
234
235        /* Default allowed locales list should be empty */
236        allowedLocales = sc.getAllowedLocales();
237        assertTrue("Empty allowed locales", allowedLocales.isEmpty());
238
239        allowedJavaLocales = sc.getAllowedJavaLocales();
240        assertTrue("Empty allowed Java locales", allowedJavaLocales.isEmpty());
241
242        /* Allow en and ru, which should enable Latin and Cyrillic only to pass */
243        ULocale enloc = new ULocale("en");
244        ULocale ruloc = new ULocale("ru_RU");
245        allowedLocales = new HashSet<ULocale>();
246        allowedLocales.add(enloc);
247        allowedLocales.add(ruloc);
248        sc = new SpoofChecker.Builder().setAllowedLocales(allowedLocales).build();
249        allowedLocales = sc.getAllowedLocales();
250        assertTrue("en in allowed locales", allowedLocales.contains(enloc));
251        assertTrue("ru_RU in allowed locales", allowedLocales.contains(ruloc));
252
253        Locale frlocJ = new Locale("fr");
254        allowedJavaLocales = new HashSet<Locale>();
255        allowedJavaLocales.add(frlocJ);
256        sc = new SpoofChecker.Builder().setAllowedJavaLocales(allowedJavaLocales).build();
257        assertFalse("no en in allowed Java locales", allowedJavaLocales.contains(new Locale("en")));
258        assertTrue("fr in allowed Java locales", allowedJavaLocales.contains(frlocJ));
259
260        /*
261         * Limit checks to SpoofChecker.CHAR_LIMIT. Some of the test data has whole script confusables also, which we
262         * don't want to see in this test.
263         */
264        sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
265
266        SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
267        checkResults = sc.failsChecks(goodLatin);
268        assertFalse("", checkResults);
269
270        checkResults = sc.failsChecks(goodGreek, result);
271        assertEquals("", SpoofChecker.CHAR_LIMIT, result.checks);
272
273        checkResults = sc.failsChecks(goodCyrl);
274        assertFalse("", checkResults);
275
276        /* Reset with an empty locale list, which should allow all characters to pass */
277        allowedLocales = new LinkedHashSet<ULocale>();
278        sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
279
280        checkResults = sc.failsChecks(goodGreek);
281        assertFalse("", checkResults);
282    }
283
284    /*
285     * AllowedChars set/get the UnicodeSet of allowed characters.
286     */
287    public void TestAllowedChars() {
288        SpoofChecker sc = new SpoofChecker.Builder().build();
289        UnicodeSet set;
290        UnicodeSet tmpSet;
291        boolean checkResults;
292
293        /* By default, we should see no restriction; the UnicodeSet should allow all characters. */
294        set = sc.getAllowedChars();
295        tmpSet = new UnicodeSet(0, 0x10ffff);
296        assertEquals("", tmpSet, set);
297
298        /* Setting the allowed chars should enable the check. */
299        sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CHAR_LIMIT).build();
300
301        /* Remove a character that is in our good Latin test identifier from the allowed chars set. */
302        tmpSet.remove(goodLatin.charAt(1));
303        sc = new SpoofChecker.Builder().setAllowedChars(tmpSet).build();
304
305        /* Latin Identifier should now fail; other non-latin test cases should still be OK */
306        SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
307        checkResults = sc.failsChecks(goodLatin, result);
308        assertTrue("", checkResults);
309        assertEquals("", SpoofChecker.CHAR_LIMIT | SpoofChecker.RESTRICTION_LEVEL, result.checks);
310
311        checkResults = sc.failsChecks(goodGreek, result);
312        assertTrue("", checkResults);
313        assertEquals("", SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, result.checks);
314    }
315
316    public void TestCheck() {
317        SpoofChecker sc = new SpoofChecker.Builder().build();
318        SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
319        boolean checkResults;
320
321        result.position = 666;
322        checkResults = sc.failsChecks(goodLatin, result);
323        assertFalse("", checkResults);
324        assertEquals("", 0, result.position);
325
326        checkResults = sc.failsChecks(goodCyrl, result);
327        assertFalse("", checkResults);
328
329        result.position = 666;
330        checkResults = sc.failsChecks(scMixed, result);
331        assertTrue("", checkResults);
332        assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks);
333        assertEquals("", 0, result.position);
334
335        result.position = 666;
336        checkResults = sc.failsChecks(han_Hiragana, result);
337        assertFalse("", checkResults);
338        assertEquals("", 0, result.position);
339        assertEquals("", 0, result.checks);
340    }
341
342    public void TestAreConfusable1() {
343        SpoofChecker sc = new SpoofChecker.Builder().build();
344        int checkResults;
345        checkResults = sc.areConfusable(scLatin, scMixed);
346        assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE, checkResults);
347
348        checkResults = sc.areConfusable(goodGreek, scLatin);
349        assertEquals("", 0, checkResults);
350
351        checkResults = sc.areConfusable(lll_Latin_a, lll_Latin_b);
352        assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResults);
353    }
354
355    public void TestGetSkeleton() {
356        SpoofChecker sc = new SpoofChecker.Builder().build();
357        String dest;
358        dest = sc.getSkeleton(SpoofChecker.ANY_CASE, lll_Latin_a);
359        assertEquals("", lll_Skel, dest);
360    }
361
362    /**
363     * IntlTestSpoof is the top level test class for the Unicode Spoof detection tests
364     */
365
366    // Test the USpoofDetector API functions that require C++
367    // The pure C part of the API, which is most of it, is tested in cintltst
368    /**
369     * IntlTestSpoof tests for USpoofDetector
370     */
371    public void TestSpoofAPI() {
372        SpoofChecker sc = new SpoofChecker.Builder().build();
373        String s = "xyz";  // Many latin ranges are whole-script confusable with other scripts.
374        // If this test starts failing, consult confusablesWholeScript.txt
375        SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
376        result.position = 666;
377        boolean checkResults = sc.failsChecks(s, result);
378        assertFalse("", checkResults);
379        assertEquals("", 0, result.position);
380
381        sc = new SpoofChecker.Builder().build();
382        String s1 = "cxs";
383        String s2 = Utility.unescape("\\u0441\\u0445\\u0455"); // Cyrillic "cxs"
384        int checkResult = sc.areConfusable(s1, s2);
385        assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, checkResult);
386
387        sc = new SpoofChecker.Builder().build();
388        s = "I1l0O";
389        String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s);
390        assertEquals("", dest, "lllOO");
391    }
392
393    public void TestSkeleton() {
394        SpoofChecker sc = new SpoofChecker.Builder().build();
395        checkSkeleton(sc, "TestSkeleton");
396    }
397
398    // testSkeleton. Spot check a number of confusable skeleton substitutions from the
399    // Unicode data file confusables.txt
400    // Test cases chosen for substitutions of various lengths, and
401    // membership in different mapping tables.
402    public void checkSkeleton(SpoofChecker sc, String testName) {
403        int ML = 0;
404        int SL = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE;
405        int MA = SpoofChecker.ANY_CASE;
406        int SA = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE;
407
408        checkSkeleton(sc, MA, "\\u02b9identifier'",  "'identifier'",  testName);
409
410        checkSkeleton(sc, SL, "nochange", "nochange", testName);
411        checkSkeleton(sc, SA, "nochange", "nochange", testName);
412        checkSkeleton(sc, ML, "nochange", "nochange", testName);
413        checkSkeleton(sc, MA, "nochange", "nochange", testName);
414        checkSkeleton(sc, MA, "love", "love", testName);
415        checkSkeleton(sc, MA, "1ove", "love", testName);   // Digit 1 to letter l
416        checkSkeleton(sc, ML, "OOPS", "OOPS", testName);
417        checkSkeleton(sc, ML, "00PS", "OOPS", testName);
418        checkSkeleton(sc, MA, "OOPS", "OOPS", testName);
419        checkSkeleton(sc, MA, "00PS", "OOPS", testName);   // Digit 0 to letter O
420        checkSkeleton(sc, SL, "\\u059c", "\\u0301", testName);
421        checkSkeleton(sc, SL, "\\u2A74", "\\u003A\\u003A\\u003D", testName);
422        checkSkeleton(sc, SL, "\\u247E", "(ll)", testName);
423        checkSkeleton(sc, SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u006c\\u0644\\u006f", testName);
424
425        // 0C83 mapping existed in the ML and MA tables, did not exist in SL, SA (Original Unicode 7)
426        //   mapping exists in all tables (ICU 55).
427        // 0C83 ; 0983 ; ML #  KANNADA SIGN VISARGA to
428        checkSkeleton(sc, SL, "\\u0C83", "\\u0983", testName);
429        checkSkeleton(sc, SA, "\\u0C83", "\\u0983", testName);
430        checkSkeleton(sc, ML, "\\u0C83", "\\u0983", testName);
431        checkSkeleton(sc, MA, "\\u0C83", "\\u0983", testName);
432
433        // 0391 mappings existed only in MA and SA tables (Original Unicode 7).
434        //      mappings exist in all tables (ICU 55)
435        checkSkeleton(sc, MA, "\\u0391", "A", testName);
436        checkSkeleton(sc, SA, "\\u0391", "A", testName);
437        checkSkeleton(sc, ML, "\\u0391", "A", testName);
438        checkSkeleton(sc, SL, "\\u0391", "A", testName);
439
440        // 13CF Mappings in all four tables, different in MA (Original Unicode 7).
441        //      Mapping same in all tables (ICU 55)
442        checkSkeleton(sc, ML, "\\u13CF", "b", testName);
443        checkSkeleton(sc, MA, "\\u13CF", "b", testName);
444        checkSkeleton(sc, SL, "\\u13CF", "b", testName);
445        checkSkeleton(sc, SA, "\\u13CF", "b", testName);
446
447        // 0022 ; 0027 0027 ;
448        // all tables
449        checkSkeleton(sc, SL, "\"", "\\u0027\\u0027", testName);
450        checkSkeleton(sc, SA, "\"", "\\u0027\\u0027", testName);
451        checkSkeleton(sc, ML, "\"", "\\u0027\\u0027", testName);
452        checkSkeleton(sc, MA, "\"", "\\u0027\\u0027", testName);
453
454    }
455
456    // Internal function to run a single skeleton test case.
457    //
458    // Run a single confusable skeleton transformation test case.
459    //
460    void checkSkeleton(SpoofChecker sc, int type, String input, String expected, String testName) {
461        String uInput = Utility.unescape(input);
462        String uExpected = Utility.unescape(expected);
463        String actual;
464        actual = sc.getSkeleton(type, uInput);
465        Throwable t = new Throwable();
466        int lineNumberOfTest = t.getStackTrace()[1].getLineNumber();
467
468        assertEquals(testName + " test at line " + lineNumberOfTest + " :  Expected (escaped): " + expected, uExpected, actual);
469    }
470
471    public void TestAreConfusable() {
472        SpoofChecker sc = new SpoofChecker.Builder().build();
473        String s1 = "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
474                + "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ";
475        String s2 = "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
476                + "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ";
477        assertEquals("", SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, sc.areConfusable(s1, s2));
478    }
479
480    public void TestInvisible() {
481        SpoofChecker sc = new SpoofChecker.Builder().build();
482        String s = Utility.unescape("abcd\\u0301ef");
483        SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
484        result.position = -42;
485        assertFalse("", sc.failsChecks(s, result));
486        assertEquals("", 0, result.checks);
487        assertEquals("", result.position, 0);
488
489        String s2 = Utility.unescape("abcd\\u0301\\u0302\\u0301ef");
490        assertTrue("", sc.failsChecks(s2, result));
491        assertEquals("", SpoofChecker.INVISIBLE, result.checks);
492        assertEquals("", 0, result.position);
493
494        // Two acute accents, one from the composed a with acute accent, \u00e1,
495        // and one separate.
496        result.position = -42;
497        String s3 = Utility.unescape("abcd\\u00e1\\u0301xyz");
498        assertTrue("", sc.failsChecks(s3, result));
499        assertEquals("", SpoofChecker.INVISIBLE, result.checks);
500        assertEquals("", 0, result.position);
501    }
502
503    public void TestRestrictionLevel() {
504        Object[][] tests = {
505                {"aγ♥", RestrictionLevel.UNRESTRICTIVE},
506                {"a", RestrictionLevel.ASCII},
507                {"γ", RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE},
508                {"aアー", RestrictionLevel.HIGHLY_RESTRICTIVE},
509                {"aऄ", RestrictionLevel.MODERATELY_RESTRICTIVE},
510                {"aγ", RestrictionLevel.MINIMALLY_RESTRICTIVE},
511        };
512        IdentifierInfo idInfo = new IdentifierInfo().setIdentifierProfile(SpoofChecker.RECOMMENDED);
513        CheckResult checkResult = new CheckResult();
514        for (Object[] test : tests) {
515            String testString = (String) test[0];
516            RestrictionLevel expectedLevel = (RestrictionLevel) test[1];
517            idInfo.setIdentifier(testString);
518            assertEquals("Testing restriction level for '" + testString + "'", expectedLevel, idInfo.getRestrictionLevel());
519            for (RestrictionLevel levelSetInSpoofChecker : RestrictionLevel.values()) {
520                SpoofChecker sc = new SpoofChecker.Builder()
521                .setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this
522                .setAllowedChars(SpoofChecker.RECOMMENDED)
523                .setRestrictionLevel(levelSetInSpoofChecker)
524                .build();
525                boolean actualValue = sc.failsChecks(testString, checkResult);
526
527                // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
528                boolean expectedFailure = expectedLevel.compareTo(levelSetInSpoofChecker) > 0 || !SpoofChecker.RECOMMENDED.containsAll(testString);
529                boolean t = assertEquals("Testing spoof restriction level for '" + testString + "', " + levelSetInSpoofChecker, expectedFailure, actualValue);
530                if (!t) { // debugging
531                    actualValue = sc.failsChecks(testString, checkResult);
532                    // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
533                    expectedFailure = expectedLevel.compareTo(levelSetInSpoofChecker) > 0 || !SpoofChecker.RECOMMENDED.containsAll(testString);
534                }
535            }
536        }
537    }
538
539    public void TestMixedNumbers() {
540        Object[][] tests = {
541                {"1", "[0]"},
542                {"१", "[०]"},
543                {"1१", "[0०]"},
544                {"١۱", "[٠۰]"},
545        };
546        IdentifierInfo idInfo = new IdentifierInfo();
547        CheckResult checkResult = new CheckResult();
548        for (Object[] test : tests) {
549            String testString = (String) test[0];
550            UnicodeSet expected = new UnicodeSet((String)test[1]);
551            idInfo.setIdentifier(testString);
552            assertEquals("", expected, idInfo.getNumerics());
553
554            SpoofChecker sc = new SpoofChecker.Builder()
555            .setChecks(SpoofChecker.MIXED_NUMBERS) // only check this
556            .build();
557            boolean actualValue = sc.failsChecks(testString, checkResult);
558            assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue);
559        }
560    }
561
562    public void TestIdentifierInfo() {
563//        contains(BitSet, BitSet)
564        BitSet bitset12 = IdentifierInfo.set(new BitSet(), UScript.LATIN, UScript.HANGUL);
565        BitSet bitset2 = IdentifierInfo.set(new BitSet(), UScript.HANGUL);
566        assertTrue("", IdentifierInfo.contains(bitset12, bitset2));
567        assertTrue("", IdentifierInfo.contains(bitset12, bitset12));
568        assertTrue("", !IdentifierInfo.contains(bitset2, bitset12));
569
570        assertTrue("", IdentifierInfo.BITSET_COMPARATOR.compare(
571                IdentifierInfo.set(new BitSet(), UScript.ARABIC),
572                IdentifierInfo.set(new BitSet(), UScript.LATIN)) < 0);
573//      displayAlternates(Collection<BitSet>)
574//      displayScripts(BitSet)
575        String scriptString = IdentifierInfo.displayScripts(bitset12);
576        assertEquals("", "Hang Latn", scriptString);
577        Set<BitSet> alternates = new HashSet(Arrays.asList(bitset12, bitset2));
578        String alternatesString = IdentifierInfo.displayAlternates(alternates);
579        assertEquals("", "Hang; Hang Latn", alternatesString);
580
581//        parseAlternates(String)
582//        parseScripts(String)
583        assertEquals("", bitset12, IdentifierInfo.parseScripts(scriptString));
584        assertEquals("", alternates, IdentifierInfo.parseAlternates(alternatesString));
585
586        String[][] tests = {
587                // String, restriction-level, numerics, scripts, alternates, common-alternates
588                {"a♥",  "UNRESTRICTIVE", "[]", "Latn", "", ""},
589                {"a〆",  "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hani Hira Kana", "Hani Hira Kana"},
590                {"aー〆",  "HIGHLY_RESTRICTIVE", "[]", "Latn", "Hira Kana", "Hira Kana"},
591                {"aー〆ア",  "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
592                {"アaー〆",  "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
593                {"a1١",  "UNRESTRICTIVE", "[0٠]", "Latn", "Arab Thaa", "Arab Thaa"},
594                {"a1١۱",  "UNRESTRICTIVE", "[0٠۰]", "Latn Arab", "", ""},
595                {"١ー〆aア1१۱",  "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
596                {"aアー〆1१١۱",  "UNRESTRICTIVE", "[0٠۰०]", "Latn Kana Arab", "Deva Kthi Mahj", "Deva Kthi Mahj"},
597        };
598        for (String[] test : tests) {
599            String testString = test[0];
600            IdentifierInfo idInfo = new IdentifierInfo();
601            idInfo.setIdentifierProfile(SpoofChecker.RECOMMENDED);
602            idInfo.setIdentifier(testString);
603            assertEquals("Identifier " + testString, testString, idInfo.getIdentifier());
604
605            RestrictionLevel restrictionLevel = RestrictionLevel.valueOf(test[1]);
606            assertEquals("RestrictionLevel " + testString, restrictionLevel, idInfo.getRestrictionLevel());
607
608            UnicodeSet numerics = new UnicodeSet(test[2]);
609            assertEquals("Numerics " + testString, numerics, idInfo.getNumerics());
610
611            BitSet scripts = IdentifierInfo.parseScripts(test[3]);
612            assertEquals("Scripts " + testString, scripts, idInfo.getScripts());
613
614            Set<BitSet> alternates2 = IdentifierInfo.parseAlternates(test[4]);
615            assertEquals("Alternates " + testString, alternates2, idInfo.getAlternates());
616
617            BitSet commonAlternates = IdentifierInfo.parseScripts(test[5]);
618            assertEquals("Common Alternates " + testString, commonAlternates, idInfo.getCommonAmongAlternates());
619        }
620
621// TODO
622//        getIdentifierProfile()
623//        setIdentifierProfile(UnicodeSet)
624    }
625
626    public void TestComparator() {
627        Random random = new Random(0);
628        for (int i = 0; i < 100; ++i) {
629            BitSet[] items = new BitSet[random.nextInt(5)+3];
630            for (int j = 0; j < items.length; ++j) {
631                items[j] = new BitSet();
632                int countInBitset = random.nextInt(5);
633                for (int k = 0; k < countInBitset; ++k) {
634                    items[j].set(random.nextInt(10));
635                }
636            }
637            checkComparator(IdentifierInfo.BITSET_COMPARATOR, items);
638        }
639    }
640
641    // Dumb implementation for now
642    private <T> void checkComparator(Comparator<T> comparator, T... items) {
643        logln("Checking " + Arrays.asList(items));
644        /*
645         * The relation is transitive: a < b and b < c implies a < c. We test here.
646         * The relation is trichotomous: exactly one of a <  b, b < a and a = b is true. Guaranteed by comparator.
647         */
648        for (int i = 0; i < items.length-2; ++i) {
649            T a = items[i];
650            for (int j = i+1; j < items.length-1; ++j) {
651                T b = items[j];
652                for (int k = j+1; k < items.length; ++k) {
653                    T c = items[k];
654                    checkTransitivity(comparator, a, b, c);
655                    checkTransitivity(comparator, a, c, b);
656                    checkTransitivity(comparator, b, a, b);
657                    checkTransitivity(comparator, b, c, a);
658                    checkTransitivity(comparator, c, a, b);
659                    checkTransitivity(comparator, c, b, a);
660                }
661            }
662        }
663    }
664
665    private <T> void checkTransitivity(Comparator<T> comparator, T a, T b, T c) {
666        int ab = comparator.compare(a,b);
667        int bc = comparator.compare(b,c);
668        int ca = comparator.compare(c,a);
669        if (!assertFalse("Transitive: " + a + ", " + b + ", " + c,
670                ab < 0 && bc < 0 && ca <= 0)) {
671            // for debugging
672            comparator.compare(a,b);
673            comparator.compare(b,c);
674            comparator.compare(c,a);
675            assertFalse("Transitive: " + a + ", " + b + ", " + c,
676                    ab < 0 && bc < 0 && ca <= 0);
677        }
678    }
679
680    private String parseHex(String in) {
681        StringBuilder sb = new StringBuilder();
682        for (String oneCharAsHexString : in.split("\\s+")) {
683            if (oneCharAsHexString.length() > 0) {
684                sb.appendCodePoint(Integer.parseInt(oneCharAsHexString, 16));
685            }
686        }
687        return sb.toString();
688    }
689
690    private String escapeString(String in) {
691        StringBuilder out = new StringBuilder();
692        for (int i = 0; i < in.length(); i++) {
693            int c = in.codePointAt(i);
694            if (c <= 0x7f) {
695                out.append((char) c);
696            } else if (c <= 0xffff) {
697                out.append(String.format("\\u%04x", c));
698            } else {
699                out.append(String.format("\\U%06x", c));
700                i++;
701            }
702        }
703        return out.toString();
704    }
705
706    // Verify that each item from the Unicode confusables.txt file
707    // transforms into the expected skeleton.
708    public void testConfData() {
709        if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) {
710            // Note: IBM Java 5 has a bug reading a large UTF-8 text contents
711            logln("Skip this test case because of the IBM Java 5 bug");
712            return;
713        }
714        try {
715            // Read in the confusables.txt file. (Distributed by Unicode.org)
716            String fileName = "unicode/confusables.txt";
717            BufferedReader confusablesRdr = TestUtil.getDataReader(fileName, "UTF-8");
718
719            // Create a default spoof checker to use in this test.
720            SpoofChecker sc = new SpoofChecker.Builder().build();
721
722            // Parse lines from the confusables.txt file. Example Line:
723            // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
724            // Lines have three fields. The hex fields can contain more than one character,
725            // and each character may be more than 4 digits (for supplemntals)
726            // This regular expression matches lines and splits the fields into capture groups.
727            // Capture group 1: map from chars
728            // 2: map to chars
729            // 3: table type, SL, ML, SA or MA
730            // 4: Comment Lines Only
731            // 5: Error Lines Only
732            Matcher parseLine = Pattern.compile(
733                    "\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)"
734                            + "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line
735                            matcher("");
736            Normalizer2 normalizer = Normalizer2.getNFDInstance();
737            int lineNum = 0;
738            String inputLine;
739            while ((inputLine = confusablesRdr.readLine()) != null) {
740                lineNum++;
741                parseLine.reset(inputLine);
742                if (!parseLine.matches()) {
743                    errln("Syntax error in confusable data file at line " + lineNum);
744                    errln(inputLine);
745                    break;
746                }
747                if (parseLine.group(4) != null) {
748                    continue; // comment line
749                }
750                String from = parseHex(parseLine.group(1));
751
752                if (!normalizer.isNormalized(from)) {
753                    // The source character was not NFD.
754                    // Skip this case; the first step in obtaining a skeleton is to NFD the input,
755                    // so the mapping in this line of confusables.txt will never be applied.
756                    continue;
757                }
758
759                String rawExpected = parseHex(parseLine.group(2));
760                String expected = normalizer.normalize(rawExpected);
761
762                int skeletonType = 0;
763                String tableType = parseLine.group(3);
764                if (tableType.equals("SL")) {
765                    skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE;
766                } else if (tableType.indexOf("SA") >= 0) {
767                    skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE;
768                } else if (tableType.indexOf("ML") >= 0) {
769                    skeletonType = 0;
770                } else if (tableType.indexOf("MA") >= 0) {
771                    skeletonType = SpoofChecker.ANY_CASE;
772                }
773
774                String actual;
775                actual = sc.getSkeleton(skeletonType, from);
776
777                if (!actual.equals(expected)) {
778                    errln("confusables.txt: " + lineNum + ": " + parseLine.group(0));
779                    errln("Actual: " + escapeString(actual));
780                }
781            }
782            confusablesRdr.close();
783        } catch (IOException e) {
784            errln(e.toString());
785        }
786    }
787}
788