1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2009-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/********************************************************************************
7*
8* File spooftest.c
9*
10*********************************************************************************/
/* C API TEST for the uspoof Unicode Identifier Spoofing and Security API */
12/**
13*   This is an API test for ICU spoof detection in plain C.  It doesn't test very many cases, and doesn't
14*   try to test the full functionality.  It just calls each function and verifies that it
15*   works on a basic level.
16*
17*   More complete testing of spoof detection functionality is done with the C++ tests.
18**/
19
20#include "unicode/utypes.h"
21#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION
22
23#include <stdlib.h>
24#include <stdio.h>
25#include <string.h>
26#include "unicode/uspoof.h"
27#include "unicode/ustring.h"
28#include "unicode/uset.h"
29#include "cintltst.h"
30
/*
 *   Assertion helpers.
 *
 *   Each macro expands to a brace-enclosed block (not an expression), logs
 *   through the test framework when its condition does not hold, and then lets
 *   execution continue -- a failed assertion never aborts the test.
 *   The operands of TEST_ASSERT_EQ / TEST_ASSERT_NE are evaluated a second
 *   time when a failure is reported, and are printed with "%d".
 */

/* Report the error name if "status" holds a failure code. */
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        log_err_status(status, "Failure at file %s, line %d, error = %s\n", \
                       __FILE__, __LINE__, u_errorName(status)); \
    } \
}

/* Report a failure, quoting the expression text, if "expr" is false. */
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        log_err("Test Failure at file %s, line %d: \"%s\" is false.\n", \
                __FILE__, __LINE__, #expr); \
    } \
}

/* Report a failure if "a" and "b" differ. */
#define TEST_ASSERT_EQ(a, b) { \
    if (!((a) == (b))) { \
        log_err("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
                __FILE__, __LINE__, #a, (a), #b, (b)); \
    } \
}

/* Report a failure if "a" and "b" are the same. */
#define TEST_ASSERT_NE(a, b) { \
    if (!((a) != (b))) { \
        log_err("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
                __FILE__, __LINE__, #a, (a), #b, (b)); \
    } \
}
44
45
/*
 *   TEST_SETUP and TEST_TEARDOWN
 *         Paired macros wrapping the boilerplate of one test case.
 *         TEST_SETUP opens a scope, declares "status" and opens the spoof
 *         checker "sc"; the code written between the two macros runs only if
 *         that open succeeded.  TEST_TEARDOWN re-checks "status", closes "sc"
 *         and ends the scope.
 *         Because the test code lands at the top of a fresh block, it may
 *         begin with its own local declarations.
 */
#define TEST_SETUP { \
    UErrorCode     status = U_ZERO_ERROR; \
    USpoofChecker *sc     = uspoof_open(&status); \
    TEST_ASSERT_SUCCESS(status); \
    if (U_SUCCESS(status)) {

#define TEST_TEARDOWN \
    } \
    TEST_ASSERT_SUCCESS(status); \
    uspoof_close(sc); \
}
64
65
66static void TestUSpoofCAPI(void);
67
68void addUSpoofTest(TestNode** root);
69
70void addUSpoofTest(TestNode** root)
71{
72#if !UCONFIG_NO_FILE_IO
73    addTest(root, &TestUSpoofCAPI, "uspoof/TestUSpoofCAPI");
74#endif
75}
76
77/*
78 *  Identifiers for verifying that spoof checking is minimally alive and working.
79 */
80const UChar goodLatin[] = {(UChar)0x75, (UChar)0x7a, 0};    /* "uz", all ASCII             */
81                                                            /*   (not confusable)          */
82const UChar scMixed[]  = {(UChar)0x73, (UChar)0x0441, 0};   /* "sc", with Cyrillic 'c'     */
83                                                            /*   (mixed script, confusable */
84
85const UChar scLatin[]  = {(UChar)0x73,  (UChar)0x63, 0};    /* "sc", plain ascii.        */
86const UChar goodCyrl[] = {(UChar)0x438, (UChar)0x43B, 0};   /* Plain lower case Cyrillic letters,
87                                                               no latin confusables         */
88
89const UChar goodGreek[]   = {(UChar)0x3c0, (UChar)0x3c6, 0};   /* Plain lower case Greek letters */
90
91const UChar lll_Latin_a[] = {(UChar)0x6c, (UChar)0x49, (UChar)0x31, 0};   /* lI1, all ASCII */
92
93                             /*  Full-width I, Small Roman Numeral fifty, Latin Cap Letter IOTA*/
94const UChar lll_Latin_b[] = {(UChar)0xff29, (UChar)0x217c, (UChar)0x196, 0};
95
96const UChar lll_Cyrl[]    = {(UChar)0x0406, (UChar)0x04C0, (UChar)0x31, 0};
97
98/* The skeleton transform for all of thes 'lll' lookalikes is all lower case l. */
99const UChar lll_Skel[]    = {(UChar)0x6c, (UChar)0x6c, (UChar)0x6c, 0};
100
101const UChar han_Hiragana[] = {(UChar)0x3086, (UChar)0x308A, (UChar)0x0020, (UChar)0x77F3, (UChar)0x7530, 0};
102
103/* Provide better code coverage */
104const char goodLatinUTF8[]    = {0x75, 0x77, 0};
105/*
106 *   Spoof Detction C API Tests
107 */
108static void TestUSpoofCAPI(void) {
109
110    /*
111     *  basic uspoof_open().
112     */
113    {
114        USpoofChecker *sc;
115        UErrorCode  status = U_ZERO_ERROR;
116        sc = uspoof_open(&status);
117        TEST_ASSERT_SUCCESS(status);
118        if (U_FAILURE(status)) {
119            /* If things are so broken that we can't even open a default spoof checker,  */
120            /*   don't even try the rest of the tests.  They would all fail.             */
121            return;
122        }
123        uspoof_close(sc);
124    }
125
126
127
128    /*
129     *  Test Open from source rules.
130    */
131    TEST_SETUP
132    const char *dataSrcDir;
133    char       *fileName;
134    char       *confusables;
135    int         confusablesLength = 0;
136    char       *confusablesWholeScript;
137    int         confusablesWholeScriptLength = 0;
138    FILE       *f;
139    UParseError pe;
140    int32_t     errType;
141    USpoofChecker *rsc;
142
143    dataSrcDir = ctest_dataSrcDir();
144    fileName = malloc(strlen(dataSrcDir) + 100);
145    strcpy(fileName, dataSrcDir);
146    strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusables.txt");
147    f = fopen(fileName, "rb");
148    TEST_ASSERT_NE(f, NULL);
149    confusables = malloc(3000000);
150    if (f != NULL) {
151        confusablesLength = fread(confusables, 1, 3000000, f);
152        fclose(f);
153    }
154
155    strcpy(fileName, dataSrcDir);
156    strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusablesWholeScript.txt");
157    f = fopen(fileName, "rb");
158    TEST_ASSERT_NE(f, NULL);
159    confusablesWholeScript = malloc(1000000);
160    if (f != NULL) {
161        confusablesWholeScriptLength = fread(confusablesWholeScript, 1, 1000000, f);
162        fclose(f);
163    }
164
165    rsc = uspoof_openFromSource(confusables, confusablesLength,
166                                              confusablesWholeScript, confusablesWholeScriptLength,
167                                              &errType, &pe, &status);
168    TEST_ASSERT_SUCCESS(status);
169
170    free(confusablesWholeScript);
171    free(confusables);
172    free(fileName);
173    uspoof_close(rsc);
174    /*  printf("ParseError Line is %d\n", pe.line);  */
175    TEST_TEARDOWN;
176
177
178    /*
179     * openFromSerialized and serialize
180    */
181    TEST_SETUP
182        int32_t        serializedSize = 0;
183        int32_t        actualLength = 0;
184        char           *buf;
185        USpoofChecker  *sc2;
186        int32_t         checkResults;
187
188
189        serializedSize = uspoof_serialize(sc, NULL, 0, &status);
190        TEST_ASSERT_EQ(status, U_BUFFER_OVERFLOW_ERROR);
191        TEST_ASSERT(serializedSize > 0);
192
193        /* Serialize the default spoof checker */
194        status = U_ZERO_ERROR;
195        buf = (char *)malloc(serializedSize + 10);
196        TEST_ASSERT(buf != NULL);
197        buf[serializedSize] = 42;
198        uspoof_serialize(sc, buf, serializedSize, &status);
199        TEST_ASSERT_SUCCESS(status);
200        TEST_ASSERT_EQ(42, buf[serializedSize]);
201
202        /* Create a new spoof checker from the freshly serialized data */
203        sc2 = uspoof_openFromSerialized(buf, serializedSize+10, &actualLength, &status);
204        TEST_ASSERT_SUCCESS(status);
205        TEST_ASSERT_NE(NULL, sc2);
206        TEST_ASSERT_EQ(serializedSize, actualLength);
207
208        /* Verify that the new spoof checker at least wiggles */
209        checkResults = uspoof_check(sc2, goodLatin, -1, NULL, &status);
210        TEST_ASSERT_SUCCESS(status);
211        TEST_ASSERT_EQ(0, checkResults);
212
213        checkResults = uspoof_check(sc2, scMixed, -1, NULL, &status);
214        TEST_ASSERT_SUCCESS(status);
215        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
216
217        uspoof_close(sc2);
218        free(buf);
219    TEST_TEARDOWN;
220
221
222
223    /*
224     * Set & Get Check Flags
225    */
226    TEST_SETUP
227        int32_t t;
228        uspoof_setChecks(sc, USPOOF_ALL_CHECKS, &status);
229        TEST_ASSERT_SUCCESS(status);
230        t = uspoof_getChecks(sc, &status);
231        TEST_ASSERT_EQ(t, USPOOF_ALL_CHECKS);
232
233        uspoof_setChecks(sc, 0, &status);
234        TEST_ASSERT_SUCCESS(status);
235        t = uspoof_getChecks(sc, &status);
236        TEST_ASSERT_EQ(0, t);
237
238        uspoof_setChecks(sc,
239                        USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE,
240                        &status);
241        TEST_ASSERT_SUCCESS(status);
242        t = uspoof_getChecks(sc, &status);
243        TEST_ASSERT_SUCCESS(status);
244        TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE, t);
245    TEST_TEARDOWN;
246
247    /*
248    * get & setAllowedChars
249    */
250    TEST_SETUP
251        USet *us;
252        const USet *uset;
253
254        uset = uspoof_getAllowedChars(sc, &status);
255        TEST_ASSERT_SUCCESS(status);
256        TEST_ASSERT(uset_isFrozen(uset));
257        us = uset_open((UChar32)0x41, (UChar32)0x5A);   /*  [A-Z]  */
258        uspoof_setAllowedChars(sc, us, &status);
259        TEST_ASSERT_SUCCESS(status);
260        TEST_ASSERT_NE(us, uspoof_getAllowedChars(sc, &status));
261        TEST_ASSERT(uset_equals(us, uspoof_getAllowedChars(sc, &status)));
262        TEST_ASSERT_SUCCESS(status);
263        uset_close(us);
264    TEST_TEARDOWN;
265
266    /*
267    *  clone()
268    */
269
270    TEST_SETUP
271        USpoofChecker *clone1 = NULL;
272        USpoofChecker *clone2 = NULL;
273        int32_t        checkResults = 0;
274
275        clone1 = uspoof_clone(sc, &status);
276        TEST_ASSERT_SUCCESS(status);
277        TEST_ASSERT_NE(clone1, sc);
278
279        clone2 = uspoof_clone(clone1, &status);
280        TEST_ASSERT_SUCCESS(status);
281        TEST_ASSERT_NE(clone2, clone1);
282
283        uspoof_close(clone1);
284
285        /* Verify that the cloned spoof checker is alive */
286        checkResults = uspoof_check(clone2, goodLatin, -1, NULL, &status);
287        TEST_ASSERT_SUCCESS(status);
288        TEST_ASSERT_EQ(0, checkResults);
289
290        checkResults = uspoof_check(clone2, scMixed, -1, NULL, &status);
291        TEST_ASSERT_SUCCESS(status);
292        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
293        uspoof_close(clone2);
294    TEST_TEARDOWN;
295
296     /*
297     *  basic uspoof_check()
298     */
299     TEST_SETUP
300         int32_t result;
301         result = uspoof_check(sc, goodLatin, -1, NULL, &status);
302         TEST_ASSERT_SUCCESS(status);
303         TEST_ASSERT_EQ(0, result);
304
305         result = uspoof_check(sc, han_Hiragana, -1, NULL, &status);
306         TEST_ASSERT_SUCCESS(status);
307         TEST_ASSERT_EQ(0, result);
308
309         result = uspoof_check(sc, scMixed, -1, NULL, &status);
310         TEST_ASSERT_SUCCESS(status);
311         TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, result);
312     TEST_TEARDOWN
313
314
315    /*
316     *  get & set Checks
317    */
318    TEST_SETUP
319        int32_t   checks;
320        int32_t   checks2;
321        int32_t   checkResults;
322
323        checks = uspoof_getChecks(sc, &status);
324        TEST_ASSERT_SUCCESS(status);
325        TEST_ASSERT_EQ(USPOOF_ALL_CHECKS, checks);
326
327        checks &= ~(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE);
328        uspoof_setChecks(sc, checks, &status);
329        TEST_ASSERT_SUCCESS(status);
330        checks2 = uspoof_getChecks(sc, &status);
331        TEST_ASSERT_EQ(checks, checks2);
332
333        /* The checks that were disabled just above are the same ones that the "scMixed" test fails.
334            So with those tests gone checking that Identifier should now succeed */
335        checkResults = uspoof_check(sc, scMixed, -1, NULL, &status);
336        TEST_ASSERT_SUCCESS(status);
337        TEST_ASSERT_EQ(0, checkResults);
338    TEST_TEARDOWN;
339
340    /*
341     *  AllowedLoacles
342     */
343
344    TEST_SETUP
345        const char  *allowedLocales;
346        int32_t  checkResults;
347
348        /* Default allowed locales list should be empty */
349        allowedLocales = uspoof_getAllowedLocales(sc, &status);
350        TEST_ASSERT_SUCCESS(status);
351        TEST_ASSERT(strcmp("", allowedLocales) == 0)
352
353        /* Allow en and ru, which should enable Latin and Cyrillic only to pass */
354        uspoof_setAllowedLocales(sc, "en, ru_RU", &status);
355        TEST_ASSERT_SUCCESS(status);
356        allowedLocales = uspoof_getAllowedLocales(sc, &status);
357        TEST_ASSERT_SUCCESS(status);
358        TEST_ASSERT(strstr(allowedLocales, "en") != NULL);
359        TEST_ASSERT(strstr(allowedLocales, "ru") != NULL);
360
361        /* Limit checks to USPOOF_CHAR_LIMIT.  Some of the test data has whole script confusables also,
362         * which we don't want to see in this test. */
363        uspoof_setChecks(sc, USPOOF_CHAR_LIMIT, &status);
364        TEST_ASSERT_SUCCESS(status);
365
366        checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
367        TEST_ASSERT_SUCCESS(status);
368        TEST_ASSERT_EQ(0, checkResults);
369
370        checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
371        TEST_ASSERT_SUCCESS(status);
372        TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);
373
374        checkResults = uspoof_check(sc, goodCyrl, -1, NULL, &status);
375        TEST_ASSERT_SUCCESS(status);
376        TEST_ASSERT_EQ(0, checkResults);
377
378        /* Reset with an empty locale list, which should allow all characters to pass */
379        uspoof_setAllowedLocales(sc, " ", &status);
380        TEST_ASSERT_SUCCESS(status);
381
382        checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
383        TEST_ASSERT_SUCCESS(status);
384        TEST_ASSERT_EQ(0, checkResults);
385    TEST_TEARDOWN;
386
387    /*
388     * AllowedChars   set/get the USet of allowed characters.
389     */
390    TEST_SETUP
391        const USet  *set;
392        USet        *tmpSet;
393        int32_t      checkResults;
394
395        /* By default, we should see no restriction; the USet should allow all characters. */
396        set = uspoof_getAllowedChars(sc, &status);
397        TEST_ASSERT_SUCCESS(status);
398        tmpSet = uset_open(0, 0x10ffff);
399        TEST_ASSERT(uset_equals(tmpSet, set));
400
401        /* Setting the allowed chars should enable the check. */
402        uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CHAR_LIMIT, &status);
403        TEST_ASSERT_SUCCESS(status);
404
405        /* Remove a character that is in our good Latin test identifier from the allowed chars set. */
406        uset_remove(tmpSet, goodLatin[1]);
407        uspoof_setAllowedChars(sc, tmpSet, &status);
408        TEST_ASSERT_SUCCESS(status);
409        uset_close(tmpSet);
410
411        /* Latin Identifier should now fail; other non-latin test cases should still be OK
412         *  Note: fail of CHAR_LIMIT also causes the restriction level to be USPOOF_UNRESTRICTIVE
413         *        which will give us a USPOOF_RESTRICTION_LEVEL failure.
414         */
415        checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
416        TEST_ASSERT_SUCCESS(status);
417        TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT | USPOOF_RESTRICTION_LEVEL, checkResults);
418
419        checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
420        TEST_ASSERT_SUCCESS(status);
421        TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
422    TEST_TEARDOWN;
423
424    /*
425     * check UTF-8
426     */
427    TEST_SETUP
428        char    utf8buf[200];
429        int32_t checkResults;
430        int32_t position;
431
432        u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status);
433        TEST_ASSERT_SUCCESS(status);
434        position = 666;
435        checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
436        TEST_ASSERT_SUCCESS(status);
437        TEST_ASSERT_EQ(0, checkResults);
438        TEST_ASSERT_EQ(0, position);
439
440        u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status);
441        TEST_ASSERT_SUCCESS(status);
442        checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
443        TEST_ASSERT_SUCCESS(status);
444        TEST_ASSERT_EQ(0, checkResults);
445
446        u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, scMixed, -1, &status);
447        TEST_ASSERT_SUCCESS(status);
448        position = 666;
449        checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
450        TEST_ASSERT_SUCCESS(status);
451        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
452        TEST_ASSERT_EQ(0, position);
453
454    TEST_TEARDOWN;
455
456    /*
457     * uspoof_areConfusable()
458     */
459    TEST_SETUP
460        int32_t  checkResults;
461
462        checkResults = uspoof_areConfusable(sc, scLatin, -1, scMixed, -1, &status);
463        TEST_ASSERT_SUCCESS(status);
464        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
465
466        checkResults = uspoof_areConfusable(sc, goodGreek, -1, scLatin, -1, &status);
467        TEST_ASSERT_SUCCESS(status);
468        TEST_ASSERT_EQ(0, checkResults);
469
470        checkResults = uspoof_areConfusable(sc, lll_Latin_a, -1, lll_Latin_b, -1, &status);
471        TEST_ASSERT_SUCCESS(status);
472        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
473
474    TEST_TEARDOWN;
475
476    /*
477     * areConfusableUTF8
478     */
479    TEST_SETUP
480        int32_t checkResults;
481        char s1[200];
482        char s2[200];
483
484
485        u_strToUTF8(s1, sizeof(s1), NULL, scLatin, -1, &status);
486        u_strToUTF8(s2, sizeof(s2), NULL, scMixed, -1, &status);
487        TEST_ASSERT_SUCCESS(status);
488        checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
489        TEST_ASSERT_SUCCESS(status);
490        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
491
492        u_strToUTF8(s1, sizeof(s1), NULL, goodGreek, -1, &status);
493        u_strToUTF8(s2, sizeof(s2), NULL, scLatin, -1, &status);
494        TEST_ASSERT_SUCCESS(status);
495        checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
496        TEST_ASSERT_SUCCESS(status);
497        TEST_ASSERT_EQ(0, checkResults);
498
499        u_strToUTF8(s1, sizeof(s1), NULL, lll_Latin_a, -1, &status);
500        u_strToUTF8(s2, sizeof(s2), NULL, lll_Latin_b, -1, &status);
501        TEST_ASSERT_SUCCESS(status);
502        checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
503        TEST_ASSERT_SUCCESS(status);
504        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
505
506    TEST_TEARDOWN;
507
508
509  /*
510   * getSkeleton
511   */
512
513    TEST_SETUP
514        UChar dest[100];
515        int32_t   skelLength;
516
517        skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, dest, sizeof(dest)/sizeof(UChar), &status);
518        TEST_ASSERT_SUCCESS(status);
519        TEST_ASSERT_EQ(0, u_strcmp(lll_Skel, dest));
520        TEST_ASSERT_EQ(u_strlen(lll_Skel), skelLength);
521
522        skelLength = uspoof_getSkeletonUTF8(sc, USPOOF_ANY_CASE, goodLatinUTF8, -1, (char*)dest,
523                                            sizeof(dest)/sizeof(UChar), &status);
524        TEST_ASSERT_SUCCESS(status);
525
526        skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, NULL, 0, &status);
527        TEST_ASSERT_EQ(U_BUFFER_OVERFLOW_ERROR, status);
528        TEST_ASSERT_EQ(3, skelLength);
529        status = U_ZERO_ERROR;
530
531    TEST_TEARDOWN;
532}
533
534#endif  /* UCONFIG_NO_REGULAR_EXPRESSIONS */
535