1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2009-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6/********************************************************************************
7*
8* File spooftest.c
9*
10*********************************************************************************/
/*C API TEST for the uspoof Unicode Identifier Spoofing and Security API */
12/**
13*   This is an API test for ICU spoof detection in plain C.  It doesn't test very many cases, and doesn't
14*   try to test the full functionality.  It just calls each function and verifies that it
15*   works on a basic level.
16*
17*   More complete testing of spoof detection functionality is done with the C++ tests.
18**/
19
20#include "unicode/utypes.h"
21#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION
22
23#include <stdlib.h>
24#include <stdio.h>
25#include <string.h>
26#include "unicode/uspoof.h"
27#include "unicode/ustring.h"
28#include "unicode/uset.h"
29#include "cintltst.h"
30
/*
 * Assertion helpers.  Each one expands to a brace-enclosed block that reports
 * the failing file and line through the test framework's logging calls and
 * then lets the test continue; none of them aborts the test.
 */

/* Report a failure when "status" holds a failing UErrorCode. */
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        log_err_status(status, "Failure at file %s, line %d, error = %s\n", \
                       __FILE__, __LINE__, u_errorName(status)); \
    } \
}

/* Report a failure when "expr" evaluates to zero (false). */
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        log_err("Test Failure at file %s, line %d: \"%s\" is false.\n", \
                __FILE__, __LINE__, #expr); \
    } \
}

/* Report a failure when a and b differ.  Both values are printed with %d. */
#define TEST_ASSERT_EQ(a, b) { \
    if ((a) != (b)) { \
        log_err("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
                __FILE__, __LINE__, #a, (a), #b, (b)); \
    } \
}

/* Report a failure when a and b are equal.  Both values are printed with %d. */
#define TEST_ASSERT_NE(a, b) { \
    if ((a) == (b)) { \
        log_err("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
                __FILE__, __LINE__, #a, (a), #b, (b)); \
    } \
}
44
45
/*
 *   TEST_SETUP and TEST_TEARDOWN
 *         A matched pair of macros wrapping the boilerplate of one test case.
 *         TEST_SETUP opens a new scope, declares "status" and opens the
 *         spoof checker "sc"; the code placed between the two macros runs
 *         only when that open succeeded.  TEST_TEARDOWN closes the inner
 *         scope, reports any failure left in "status", closes "sc" and
 *         closes the scope opened by TEST_SETUP.
 */
#define TEST_SETUP { \
    UErrorCode     status = U_ZERO_ERROR; \
    USpoofChecker *sc = uspoof_open(&status); \
    TEST_ASSERT_SUCCESS(status); \
    if (U_SUCCESS(status)) {

#define TEST_TEARDOWN \
    } \
    TEST_ASSERT_SUCCESS(status); \
    uspoof_close(sc); \
}
64
65
66static void TestUSpoofCAPI(void);
67
68void addUSpoofTest(TestNode** root);
69
70void addUSpoofTest(TestNode** root)
71{
72#if !UCONFIG_NO_FILE_IO
73    addTest(root, &TestUSpoofCAPI, "uspoof/TestUSpoofCAPI");
74#endif
75}
76
77/*
78 *  Identifiers for verifying that spoof checking is minimally alive and working.
79 */
80const UChar goodLatin[] = {(UChar)0x75, (UChar)0x7a, 0};    /* "uz", all ASCII             */
81                                                            /*   (not confusable)          */
82const UChar scMixed[]  = {(UChar)0x73, (UChar)0x0441, 0};   /* "sc", with Cyrillic 'c'     */
83                                                            /*   (mixed script, confusable */
84
85const UChar scLatin[]  = {(UChar)0x73,  (UChar)0x63, 0};    /* "sc", plain ascii.        */
86const UChar goodCyrl[] = {(UChar)0x438, (UChar)0x43B, 0};   /* Plain lower case Cyrillic letters,
87                                                               no latin confusables         */
88
89const UChar goodGreek[]   = {(UChar)0x3c0, (UChar)0x3c6, 0};   /* Plain lower case Greek letters */
90
91const UChar lll_Latin_a[] = {(UChar)0x6c, (UChar)0x49, (UChar)0x31, 0};   /* lI1, all ASCII */
92
93                             /*  Full-width I, Small Roman Numeral fifty, Latin Cap Letter IOTA*/
94const UChar lll_Latin_b[] = {(UChar)0xff29, (UChar)0x217c, (UChar)0x196, 0};
95
96const UChar lll_Cyrl[]    = {(UChar)0x0406, (UChar)0x04C0, (UChar)0x31, 0};
97
98/* The skeleton transform for all of thes 'lll' lookalikes is all lower case l. */
99const UChar lll_Skel[]    = {(UChar)0x6c, (UChar)0x6c, (UChar)0x6c, 0};
100
101/* Provide better code coverage */
102const char goodLatinUTF8[]    = {0x75, 0x77, 0};
103/*
104 *   Spoof Detction C API Tests
105 */
106static void TestUSpoofCAPI(void) {
107
108    /*
109     *  basic uspoof_open().
110     */
111    {
112        USpoofChecker *sc;
113        UErrorCode  status = U_ZERO_ERROR;
114        sc = uspoof_open(&status);
115        TEST_ASSERT_SUCCESS(status);
116        if (U_FAILURE(status)) {
117            /* If things are so broken that we can't even open a default spoof checker,  */
118            /*   don't even try the rest of the tests.  They would all fail.             */
119            return;
120        }
121        uspoof_close(sc);
122    }
123
124
125
126    /*
127     *  Test Open from source rules.
128    */
129    TEST_SETUP
130    const char *dataSrcDir;
131    char       *fileName;
132    char       *confusables;
133    int         confusablesLength;
134    char       *confusablesWholeScript;
135    int         confusablesWholeScriptLength;
136    FILE       *f;
137    UParseError pe;
138    int32_t     errType;
139    USpoofChecker *rsc;
140
141    dataSrcDir = ctest_dataSrcDir();
142    fileName = malloc(strlen(dataSrcDir) + 100);
143    strcpy(fileName, dataSrcDir);
144    strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusables.txt");
145    f = fopen(fileName, "r");
146    TEST_ASSERT_NE(f, NULL);
147    confusables = malloc(3000000);
148    confusablesLength = fread(confusables, 1, 3000000, f);
149    fclose(f);
150
151
152    strcpy(fileName, dataSrcDir);
153    strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusablesWholeScript.txt");
154    f = fopen(fileName, "r");
155    TEST_ASSERT_NE(f, NULL);
156    confusablesWholeScript = malloc(1000000);
157    confusablesWholeScriptLength = fread(confusablesWholeScript, 1, 1000000, f);
158    fclose(f);
159
160    rsc = uspoof_openFromSource(confusables, confusablesLength,
161                                              confusablesWholeScript, confusablesWholeScriptLength,
162                                              &errType, &pe, &status);
163    TEST_ASSERT_SUCCESS(status);
164
165    free(confusablesWholeScript);
166    free(confusables);
167    free(fileName);
168    uspoof_close(rsc);
169    /*  printf("ParseError Line is %d\n", pe.line);  */
170    TEST_TEARDOWN;
171
172
173    /*
174     * openFromSerialized and serialize
175    */
176    TEST_SETUP
177        int32_t        serializedSize = 0;
178        int32_t        actualLength = 0;
179        char           *buf;
180        USpoofChecker  *sc2;
181        int32_t         checkResults;
182
183
184        serializedSize = uspoof_serialize(sc, NULL, 0, &status);
185        TEST_ASSERT_EQ(status, U_BUFFER_OVERFLOW_ERROR);
186        TEST_ASSERT(serializedSize > 0);
187
188        /* Serialize the default spoof checker */
189        status = U_ZERO_ERROR;
190        buf = (char *)malloc(serializedSize + 10);
191        TEST_ASSERT(buf != NULL);
192        buf[serializedSize] = 42;
193        uspoof_serialize(sc, buf, serializedSize, &status);
194        TEST_ASSERT_SUCCESS(status);
195        TEST_ASSERT_EQ(42, buf[serializedSize]);
196
197        /* Create a new spoof checker from the freshly serialized data */
198        sc2 = uspoof_openFromSerialized(buf, serializedSize+10, &actualLength, &status);
199        TEST_ASSERT_SUCCESS(status);
200        TEST_ASSERT_NE(NULL, sc2);
201        TEST_ASSERT_EQ(serializedSize, actualLength);
202
203        /* Verify that the new spoof checker at least wiggles */
204        checkResults = uspoof_check(sc2, goodLatin, -1, NULL, &status);
205        TEST_ASSERT_SUCCESS(status);
206        TEST_ASSERT_EQ(0, checkResults);
207
208        checkResults = uspoof_check(sc2, scMixed, -1, NULL, &status);
209        TEST_ASSERT_SUCCESS(status);
210        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
211
212        uspoof_close(sc2);
213        free(buf);
214    TEST_TEARDOWN;
215
216
217
218    /*
219     * Set & Get Check Flags
220    */
221    TEST_SETUP
222        int32_t t;
223        uspoof_setChecks(sc, USPOOF_ALL_CHECKS, &status);
224        TEST_ASSERT_SUCCESS(status);
225        t = uspoof_getChecks(sc, &status);
226        TEST_ASSERT_EQ(t, USPOOF_ALL_CHECKS);
227
228        uspoof_setChecks(sc, 0, &status);
229        TEST_ASSERT_SUCCESS(status);
230        t = uspoof_getChecks(sc, &status);
231        TEST_ASSERT_EQ(0, t);
232
233        uspoof_setChecks(sc,
234                        USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE,
235                        &status);
236        TEST_ASSERT_SUCCESS(status);
237        t = uspoof_getChecks(sc, &status);
238        TEST_ASSERT_SUCCESS(status);
239        TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE, t);
240    TEST_TEARDOWN;
241
242    /*
243    * get & setAllowedChars
244    */
245    TEST_SETUP
246        USet *us;
247        const USet *uset;
248
249        uset = uspoof_getAllowedChars(sc, &status);
250        TEST_ASSERT_SUCCESS(status);
251        TEST_ASSERT(uset_isFrozen(uset));
252        us = uset_open((UChar32)0x41, (UChar32)0x5A);   /*  [A-Z]  */
253        uspoof_setAllowedChars(sc, us, &status);
254        TEST_ASSERT_SUCCESS(status);
255        TEST_ASSERT_NE(us, uspoof_getAllowedChars(sc, &status));
256        TEST_ASSERT(uset_equals(us, uspoof_getAllowedChars(sc, &status)));
257        TEST_ASSERT_SUCCESS(status);
258        uset_close(us);
259    TEST_TEARDOWN;
260
261    /*
262    *  clone()
263    */
264
265    TEST_SETUP
266        USpoofChecker *clone1 = NULL;
267        USpoofChecker *clone2 = NULL;
268        int32_t        checkResults = 0;
269
270        clone1 = uspoof_clone(sc, &status);
271        TEST_ASSERT_SUCCESS(status);
272        TEST_ASSERT_NE(clone1, sc);
273
274        clone2 = uspoof_clone(clone1, &status);
275        TEST_ASSERT_SUCCESS(status);
276        TEST_ASSERT_NE(clone2, clone1);
277
278        uspoof_close(clone1);
279
280        /* Verify that the cloned spoof checker is alive */
281        checkResults = uspoof_check(clone2, goodLatin, -1, NULL, &status);
282        TEST_ASSERT_SUCCESS(status);
283        TEST_ASSERT_EQ(0, checkResults);
284
285        checkResults = uspoof_check(clone2, scMixed, -1, NULL, &status);
286        TEST_ASSERT_SUCCESS(status);
287        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
288        uspoof_close(clone2);
289    TEST_TEARDOWN;
290
291    /*
292     *  get & set Checks
293    */
294    TEST_SETUP
295        int32_t   checks;
296        int32_t   checks2;
297        int32_t   checkResults;
298
299        checks = uspoof_getChecks(sc, &status);
300        TEST_ASSERT_SUCCESS(status);
301        TEST_ASSERT_EQ(USPOOF_ALL_CHECKS, checks);
302
303        checks &= ~(USPOOF_SINGLE_SCRIPT | USPOOF_MIXED_SCRIPT_CONFUSABLE);
304        uspoof_setChecks(sc, checks, &status);
305        TEST_ASSERT_SUCCESS(status);
306        checks2 = uspoof_getChecks(sc, &status);
307        TEST_ASSERT_EQ(checks, checks2);
308
309        /* The checks that were disabled just above are the same ones that the "scMixed" test fails.
310            So with those tests gone checking that Identifier should now succeed */
311        checkResults = uspoof_check(sc, scMixed, -1, NULL, &status);
312        TEST_ASSERT_SUCCESS(status);
313        TEST_ASSERT_EQ(0, checkResults);
314    TEST_TEARDOWN;
315
316    /*
317     *  AllowedLoacles
318     */
319
320    TEST_SETUP
321        const char  *allowedLocales;
322        int32_t  checkResults;
323
324        /* Default allowed locales list should be empty */
325        allowedLocales = uspoof_getAllowedLocales(sc, &status);
326        TEST_ASSERT_SUCCESS(status);
327        TEST_ASSERT(strcmp("", allowedLocales) == 0)
328
329        /* Allow en and ru, which should enable Latin and Cyrillic only to pass */
330        uspoof_setAllowedLocales(sc, "en, ru_RU", &status);
331        TEST_ASSERT_SUCCESS(status);
332        allowedLocales = uspoof_getAllowedLocales(sc, &status);
333        TEST_ASSERT_SUCCESS(status);
334        TEST_ASSERT(strstr(allowedLocales, "en") != NULL);
335        TEST_ASSERT(strstr(allowedLocales, "ru") != NULL);
336
337        /* Limit checks to USPOOF_CHAR_LIMIT.  Some of the test data has whole script confusables also,
338         * which we don't want to see in this test. */
339        uspoof_setChecks(sc, USPOOF_CHAR_LIMIT, &status);
340        TEST_ASSERT_SUCCESS(status);
341
342        checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
343        TEST_ASSERT_SUCCESS(status);
344        TEST_ASSERT_EQ(0, checkResults);
345
346        checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
347        TEST_ASSERT_SUCCESS(status);
348        TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);
349
350        checkResults = uspoof_check(sc, goodCyrl, -1, NULL, &status);
351        TEST_ASSERT_SUCCESS(status);
352        TEST_ASSERT_EQ(0, checkResults);
353
354        /* Reset with an empty locale list, which should allow all characters to pass */
355        uspoof_setAllowedLocales(sc, " ", &status);
356        TEST_ASSERT_SUCCESS(status);
357
358        checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
359        TEST_ASSERT_SUCCESS(status);
360        TEST_ASSERT_EQ(0, checkResults);
361    TEST_TEARDOWN;
362
363    /*
364     * AllowedChars   set/get the USet of allowed characters.
365     */
366    TEST_SETUP
367        const USet  *set;
368        USet        *tmpSet;
369        int32_t      checkResults;
370
371        /* By default, we should see no restriction; the USet should allow all characters. */
372        set = uspoof_getAllowedChars(sc, &status);
373        TEST_ASSERT_SUCCESS(status);
374        tmpSet = uset_open(0, 0x10ffff);
375        TEST_ASSERT(uset_equals(tmpSet, set));
376
377        /* Setting the allowed chars should enable the check. */
378        uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CHAR_LIMIT, &status);
379        TEST_ASSERT_SUCCESS(status);
380
381        /* Remove a character that is in our good Latin test identifier from the allowed chars set. */
382        uset_remove(tmpSet, goodLatin[1]);
383        uspoof_setAllowedChars(sc, tmpSet, &status);
384        TEST_ASSERT_SUCCESS(status);
385        uset_close(tmpSet);
386
387        /* Latin Identifier should now fail; other non-latin test cases should still be OK */
388        checkResults = uspoof_check(sc, goodLatin, -1, NULL, &status);
389        TEST_ASSERT_SUCCESS(status);
390        TEST_ASSERT_EQ(USPOOF_CHAR_LIMIT, checkResults);
391
392        checkResults = uspoof_check(sc, goodGreek, -1, NULL, &status);
393        TEST_ASSERT_SUCCESS(status);
394        TEST_ASSERT_EQ(USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
395    TEST_TEARDOWN;
396
397    /*
398     * check UTF-8
399     */
400    TEST_SETUP
401        char    utf8buf[200];
402        int32_t checkResults;
403        int32_t position;
404
405        u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodLatin, -1, &status);
406        TEST_ASSERT_SUCCESS(status);
407        position = 666;
408        checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
409        TEST_ASSERT_SUCCESS(status);
410        TEST_ASSERT_EQ(0, checkResults);
411        TEST_ASSERT_EQ(666, position);
412
413        u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, goodCyrl, -1, &status);
414        TEST_ASSERT_SUCCESS(status);
415        checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
416        TEST_ASSERT_SUCCESS(status);
417        TEST_ASSERT_EQ(0, checkResults);
418
419        u_strToUTF8(utf8buf, sizeof(utf8buf), NULL, scMixed, -1, &status);
420        TEST_ASSERT_SUCCESS(status);
421        position = 666;
422        checkResults = uspoof_checkUTF8(sc, utf8buf, -1, &position, &status);
423        TEST_ASSERT_SUCCESS(status);
424        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_SINGLE_SCRIPT , checkResults);
425        TEST_ASSERT_EQ(2, position);
426
427    TEST_TEARDOWN;
428
429    /*
430     * uspoof_areConfusable()
431     */
432    TEST_SETUP
433        int32_t  checkResults;
434
435        checkResults = uspoof_areConfusable(sc, scLatin, -1, scMixed, -1, &status);
436        TEST_ASSERT_SUCCESS(status);
437        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
438
439        checkResults = uspoof_areConfusable(sc, goodGreek, -1, scLatin, -1, &status);
440        TEST_ASSERT_SUCCESS(status);
441        TEST_ASSERT_EQ(0, checkResults);
442
443        checkResults = uspoof_areConfusable(sc, lll_Latin_a, -1, lll_Latin_b, -1, &status);
444        TEST_ASSERT_SUCCESS(status);
445        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
446
447    TEST_TEARDOWN;
448
449    /*
450     * areConfusableUTF8
451     */
452    TEST_SETUP
453        int32_t checkResults;
454        char s1[200];
455        char s2[200];
456
457
458        u_strToUTF8(s1, sizeof(s1), NULL, scLatin, -1, &status);
459        u_strToUTF8(s2, sizeof(s2), NULL, scMixed, -1, &status);
460        TEST_ASSERT_SUCCESS(status);
461        checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
462        TEST_ASSERT_SUCCESS(status);
463        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE, checkResults);
464
465        u_strToUTF8(s1, sizeof(s1), NULL, goodGreek, -1, &status);
466        u_strToUTF8(s2, sizeof(s2), NULL, scLatin, -1, &status);
467        TEST_ASSERT_SUCCESS(status);
468        checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
469        TEST_ASSERT_SUCCESS(status);
470        TEST_ASSERT_EQ(0, checkResults);
471
472        u_strToUTF8(s1, sizeof(s1), NULL, lll_Latin_a, -1, &status);
473        u_strToUTF8(s2, sizeof(s2), NULL, lll_Latin_b, -1, &status);
474        TEST_ASSERT_SUCCESS(status);
475        checkResults = uspoof_areConfusableUTF8(sc, s1, -1, s2, -1, &status);
476        TEST_ASSERT_SUCCESS(status);
477        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, checkResults);
478
479    TEST_TEARDOWN;
480
481
482  /*
483   * getSkeleton
484   */
485
486    TEST_SETUP
487        UChar dest[100];
488        int32_t   skelLength;
489
490        skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, dest, sizeof(dest)/sizeof(UChar), &status);
491        TEST_ASSERT_SUCCESS(status);
492        TEST_ASSERT_EQ(0, u_strcmp(lll_Skel, dest));
493        TEST_ASSERT_EQ(u_strlen(lll_Skel), skelLength);
494
495        skelLength = uspoof_getSkeletonUTF8(sc, USPOOF_ANY_CASE, goodLatinUTF8, -1, (char*)dest,
496                                            sizeof(dest)/sizeof(UChar), &status);
497        TEST_ASSERT_SUCCESS(status);
498
499        skelLength = uspoof_getSkeleton(sc, USPOOF_ANY_CASE, lll_Latin_a, -1, NULL, 0, &status);
500        TEST_ASSERT_EQ(U_BUFFER_OVERFLOW_ERROR, status);
501        TEST_ASSERT_EQ(3, skelLength);
502        status = U_ZERO_ERROR;
503
504    TEST_TEARDOWN;
505}
506
507#endif  /* UCONFIG_NO_REGULAR_EXPRESSIONS */
508