1/*
2***************************************************************************
3* Copyright (C) 2008-2014, International Business Machines Corporation
4* and others. All Rights Reserved.
5***************************************************************************
6*   file name:  uspoof.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2008Feb13
12*   created by: Andy Heninger
13*
14*   Unicode Spoof Detection
15*/
16#include "unicode/utypes.h"
17#include "unicode/normalizer2.h"
18#include "unicode/uspoof.h"
19#include "unicode/ustring.h"
20#include "unicode/utf16.h"
21#include "cmemory.h"
22#include "cstring.h"
23#include "identifier_info.h"
24#include "mutex.h"
25#include "scriptset.h"
26#include "uassert.h"
27#include "ucln_in.h"
28#include "uspoof_impl.h"
29#include "umutex.h"
30
31
32#if !UCONFIG_NO_NORMALIZATION
33
34U_NAMESPACE_USE
35
36
//
// Static objects used by the spoof implementation, their thread-safe initialization, and their cleanup.
//
40static UnicodeSet *gInclusionSet = NULL;
41static UnicodeSet *gRecommendedSet = NULL;
42static const Normalizer2 *gNfdNormalizer = NULL;
43static UInitOnce gSpoofInitOnce = U_INITONCE_INITIALIZER;
44
45static UBool U_CALLCONV
46uspoof_cleanup(void) {
47    delete gInclusionSet;
48    gInclusionSet = NULL;
49    delete gRecommendedSet;
50    gRecommendedSet = NULL;
51    gNfdNormalizer = NULL;
52    gSpoofInitOnce.reset();
53    return TRUE;
54}
55
56static void U_CALLCONV initializeStatics(UErrorCode &status) {
57    static const char *inclusionPat =
58           "[\\u0027\\u002d-\\u002e\\u003A\\u00B7\\u0375\\u058A\\u05F3-\\u05F4"
59           "\\u06FD-\\u06FE\\u0F0B\\u200C-\\u200D\\u2010\\u2019\\u2027\\u30A0\\u30FB]";
60    gInclusionSet = new UnicodeSet(UnicodeString(inclusionPat, -1, US_INV), status);
61
62    // Note: data from http://unicode.org/Public/security/latest/xidmodifications.txt version 6.3.0
63    // Note: concatenated string constants do not work with UNICODE_STRING_SIMPLE on all platforms.
64    static const char *recommendedPat =
65            "[\\u0030-\\u0039\\u0041-\\u005A\\u005F\\u0061-\\u007A\\u00C0-\\u00D6\\u00D8-\\u00F6"
66            "\\u00F8-\\u0131\\u0134-\\u013E\\u0141-\\u0148\\u014A-\\u017E\\u01A0-\\u01A1\\u01AF-\\u01B0"
67            "\\u01CD-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B\\u021E-\\u021F"
68            "\\u0226-\\u0233\\u0259\\u02BB-\\u02BC\\u02EC\\u0300-\\u0304\\u0306-\\u030C\\u030F-\\u0311"
69            "\\u0313-\\u0314\\u031B\\u0323-\\u0328\\u032D-\\u032E\\u0330-\\u0331\\u0335\\u0338-\\u0339"
70            "\\u0342\\u0345\\u037B-\\u037D\\u0386\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE"
71            "\\u03FC-\\u045F\\u048A-\\u0527\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05B4\\u05D0-\\u05EA"
72            "\\u05F0-\\u05F2\\u0620-\\u063F\\u0641-\\u0655\\u0660-\\u0669\\u0670-\\u0672\\u0674"
73            "\\u0679-\\u068D\\u068F-\\u06D3\\u06D5\\u06E5-\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1"
74            "\\u08A0\\u08A2-\\u08AC\\u0901-\\u094D\\u094F-\\u0950\\u0956-\\u0957\\u0960-\\u0963"
75            "\\u0966-\\u096F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F-\\u0990"
76            "\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7-\\u09C8"
77            "\\u09CB-\\u09CE\\u09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u0A01-\\u0A03\\u0A05-\\u0A0A"
78            "\\u0A0F-\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38-\\u0A39\\u0A3C"
79            "\\u0A3E-\\u0A42\\u0A47-\\u0A48\\u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83"
80            "\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2-\\u0AB3\\u0AB5-\\u0AB9"
81            "\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF"
82            "\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F-\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32-\\u0B33"
83            "\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47-\\u0B48\\u0B4B-\\u0B4D\\u0B56-\\u0B57\\u0B5F-\\u0B61"
84            "\\u0B66-\\u0B6F\\u0B71\\u0B82-\\u0B83\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95"
85            "\\u0B99-\\u0B9A\\u0B9C\\u0B9E-\\u0B9F\\u0BA3-\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9"
86            "\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C03"
87            "\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44"
88            "\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55-\\u0C56\\u0C60-\\u0C61\\u0C66-\\u0C6F\\u0C82-\\u0C83"
89            "\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4"
90            "\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5-\\u0CD6\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1-\\u0CF2"
91            "\\u0D02-\\u0D03\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48"
92            "\\u0D4A-\\u0D4E\\u0D57\\u0D60-\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82-\\u0D83"
93            "\\u0D85-\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD"
94            "\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32"
95            "\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\u0E59\\u0E81-\\u0E82\\u0E84\\u0E87-\\u0E88"
96            "\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA-\\u0EAB"
97            "\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD\\u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD"
98            "\\u0ED0-\\u0ED9\\u0EDE-\\u0EDF\\u0F00\\u0F20-\\u0F29\\u0F35\\u0F37\\u0F3E-\\u0F42"
99            "\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F56\\u0F58-\\u0F5B\\u0F5D-\\u0F68"
100            "\\u0F6A-\\u0F6C\\u0F71-\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0F82-\\u0F84\\u0F86-\\u0F92"
101            "\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8"
102            "\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10C7\\u10CD\\u10D0-\\u10F0"
103            "\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D\\u1250-\\u1256\\u1258"
104            "\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE"
105            "\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1315\\u1318-\\u135A"
106            "\\u135D-\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CA"
107            "\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1E00-\\u1E99\\u1EBF\\u1F00-\\u1F15\\u1F18-\\u1F1D"
108            "\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70"
109            "\\u1F72\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4"
110            "\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA"
111            "\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6"
112            "\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6"
113            "\\u2DD8-\\u2DDE\\u3005-\\u3007\\u3041-\\u3096\\u3099-\\u309A\\u309D-\\u309E\\u30A1-\\u30FA"
114            "\\u30FC-\\u30FE\\u3105-\\u312D\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-\\u9FCC\\uA660-\\uA661"
115            "\\uA674-\\uA67B\\uA67F\\uA69F\\uA717-\\uA71F\\uA788\\uA78D-\\uA78E\\uA790-\\uA793"
116            "\\uA7A0-\\uA7AA\\uA7FA\\uA9CF\\uAA60-\\uAA76\\uAA7A-\\uAA7B\\uAB01-\\uAB06\\uAB09-\\uAB0E"
117            "\\uAB11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E-\\uFA0F\\uFA11"
118            "\\uFA13-\\uFA14\\uFA1F\\uFA21\\uFA23-\\uFA24\\uFA27-\\uFA29\\U0001B000-\\U0001B001\\U00020000-\\U0002A6D6"
119            "\\U0002A700-\\U0002B734\\U0002B740-\\U0002B81D]";
120    gRecommendedSet = new UnicodeSet(UnicodeString(recommendedPat, -1, US_INV), status);
121    gNfdNormalizer = Normalizer2::getNFDInstance(status);
122    ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup);
123}
124
125
126U_CAPI USpoofChecker * U_EXPORT2
127uspoof_open(UErrorCode *status) {
128    if (U_FAILURE(*status)) {
129        return NULL;
130    }
131    umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
132    SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
133    if (U_FAILURE(*status)) {
134        delete si;
135        si = NULL;
136    }
137    return reinterpret_cast<USpoofChecker *>(si);
138}
139
140
141U_CAPI USpoofChecker * U_EXPORT2
142uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
143                          UErrorCode *status) {
144    if (U_FAILURE(*status)) {
145        return NULL;
146    }
147    umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
148    SpoofData *sd = new SpoofData(data, length, *status);
149    SpoofImpl *si = new SpoofImpl(sd, *status);
150    if (U_FAILURE(*status)) {
151        delete sd;
152        delete si;
153        return NULL;
154    }
155    if (sd == NULL || si == NULL) {
156        *status = U_MEMORY_ALLOCATION_ERROR;
157        delete sd;
158        delete si;
159        return NULL;
160    }
161
162    if (pActualLength != NULL) {
163        *pActualLength = sd->fRawData->fLength;
164    }
165    return reinterpret_cast<USpoofChecker *>(si);
166}
167
168
169U_CAPI USpoofChecker * U_EXPORT2
170uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
171    const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
172    if (src == NULL) {
173        return NULL;
174    }
175    SpoofImpl *result = new SpoofImpl(*src, *status);   // copy constructor
176    if (U_FAILURE(*status)) {
177        delete result;
178        result = NULL;
179    }
180    return reinterpret_cast<USpoofChecker *>(result);
181}
182
183
184U_CAPI void U_EXPORT2
185uspoof_close(USpoofChecker *sc) {
186    UErrorCode status = U_ZERO_ERROR;
187    SpoofImpl *This = SpoofImpl::validateThis(sc, status);
188    delete This;
189}
190
191
192U_CAPI void U_EXPORT2
193uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
194    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
195    if (This == NULL) {
196        return;
197    }
198
199    // Verify that the requested checks are all ones (bits) that
200    //   are acceptable, known values.
201    if (checks & ~(USPOOF_ALL_CHECKS | USPOOF_AUX_INFO)) {
202        *status = U_ILLEGAL_ARGUMENT_ERROR;
203        return;
204    }
205
206    This->fChecks = checks;
207}
208
209
210U_CAPI int32_t U_EXPORT2
211uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
212    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
213    if (This == NULL) {
214        return 0;
215    }
216    return This->fChecks;
217}
218
219U_CAPI void U_EXPORT2
220uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel) {
221    UErrorCode status = U_ZERO_ERROR;
222    SpoofImpl *This = SpoofImpl::validateThis(sc, status);
223    if (This != NULL) {
224        This->fRestrictionLevel = restrictionLevel;
225    }
226}
227
228U_CAPI URestrictionLevel U_EXPORT2
229uspoof_getRestrictionLevel(const USpoofChecker *sc) {
230    UErrorCode status = U_ZERO_ERROR;
231    const SpoofImpl *This = SpoofImpl::validateThis(sc, status);
232    if (This == NULL) {
233        return USPOOF_UNRESTRICTIVE;
234    }
235    return This->fRestrictionLevel;
236}
237
238U_CAPI void U_EXPORT2
239uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
240    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
241    if (This == NULL) {
242        return;
243    }
244    This->setAllowedLocales(localesList, *status);
245}
246
247U_CAPI const char * U_EXPORT2
248uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) {
249    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
250    if (This == NULL) {
251        return NULL;
252    }
253    return This->getAllowedLocales(*status);
254}
255
256
257U_CAPI const USet * U_EXPORT2
258uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
259    const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
260    return result->toUSet();
261}
262
263U_CAPI const UnicodeSet * U_EXPORT2
264uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
265    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
266    if (This == NULL) {
267        return NULL;
268    }
269    return This->fAllowedCharsSet;
270}
271
272
273U_CAPI void U_EXPORT2
274uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
275    const UnicodeSet *set = UnicodeSet::fromUSet(chars);
276    uspoof_setAllowedUnicodeSet(sc, set, status);
277}
278
279
280U_CAPI void U_EXPORT2
281uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
282    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
283    if (This == NULL) {
284        return;
285    }
286    if (chars->isBogus()) {
287        *status = U_ILLEGAL_ARGUMENT_ERROR;
288        return;
289    }
290    UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
291    if (clonedSet == NULL || clonedSet->isBogus()) {
292        *status = U_MEMORY_ALLOCATION_ERROR;
293        return;
294    }
295    clonedSet->freeze();
296    delete This->fAllowedCharsSet;
297    This->fAllowedCharsSet = clonedSet;
298    This->fChecks |= USPOOF_CHAR_LIMIT;
299}
300
301
302U_CAPI int32_t U_EXPORT2
303uspoof_check(const USpoofChecker *sc,
304             const UChar *id, int32_t length,
305             int32_t *position,
306             UErrorCode *status) {
307
308    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
309    if (This == NULL) {
310        return 0;
311    }
312    if (length < -1) {
313        *status = U_ILLEGAL_ARGUMENT_ERROR;
314        return 0;
315    }
316    UnicodeString idStr((length == -1), id, length);  // Aliasing constructor.
317    int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
318    return result;
319}
320
321
322U_CAPI int32_t U_EXPORT2
323uspoof_checkUTF8(const USpoofChecker *sc,
324                 const char *id, int32_t length,
325                 int32_t *position,
326                 UErrorCode *status) {
327
328    if (U_FAILURE(*status)) {
329        return 0;
330    }
331    UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
332    int32_t result = uspoof_checkUnicodeString(sc, idStr, position, status);
333    return result;
334}
335
336
337U_CAPI int32_t U_EXPORT2
338uspoof_areConfusable(const USpoofChecker *sc,
339                     const UChar *id1, int32_t length1,
340                     const UChar *id2, int32_t length2,
341                     UErrorCode *status) {
342    SpoofImpl::validateThis(sc, *status);
343    if (U_FAILURE(*status)) {
344        return 0;
345    }
346    if (length1 < -1 || length2 < -1) {
347        *status = U_ILLEGAL_ARGUMENT_ERROR;
348        return 0;
349    }
350
351    UnicodeString id1Str((length1==-1), id1, length1);  // Aliasing constructor
352    UnicodeString id2Str((length2==-1), id2, length2);  // Aliasing constructor
353    return uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status);
354}
355
356
357U_CAPI int32_t U_EXPORT2
358uspoof_areConfusableUTF8(const USpoofChecker *sc,
359                         const char *id1, int32_t length1,
360                         const char *id2, int32_t length2,
361                         UErrorCode *status) {
362    SpoofImpl::validateThis(sc, *status);
363    if (U_FAILURE(*status)) {
364        return 0;
365    }
366    if (length1 < -1 || length2 < -1) {
367        *status = U_ILLEGAL_ARGUMENT_ERROR;
368        return 0;
369    }
370    UnicodeString id1Str = UnicodeString::fromUTF8(StringPiece(id1, length1>=0? length1 : uprv_strlen(id1)));
371    UnicodeString id2Str = UnicodeString::fromUTF8(StringPiece(id2, length2>=0? length2 : uprv_strlen(id2)));
372    int32_t results = uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status);
373    return results;
374}
375
376
377U_CAPI int32_t U_EXPORT2
378uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
379                                  const icu::UnicodeString &id1,
380                                  const icu::UnicodeString &id2,
381                                  UErrorCode *status) {
382    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
383    if (U_FAILURE(*status)) {
384        return 0;
385    }
386    //
387    // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
388    //   and for definitions of the types (single, whole, mixed-script) of confusables.
389
390    // We only care about a few of the check flags.  Ignore the others.
391    // If no tests relavant to this function have been specified, return an error.
392    // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
393    //        but logically we would just return 0 (no error).
394    if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
395                          USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
396        *status = U_INVALID_STATE_ERROR;
397        return 0;
398    }
399    int32_t  flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
400
401    int32_t  result = 0;
402    IdentifierInfo *identifierInfo = This->getIdentifierInfo(*status);
403    if (U_FAILURE(*status)) {
404        return 0;
405    }
406    identifierInfo->setIdentifier(id1, *status);
407    int32_t id1ScriptCount = identifierInfo->getScriptCount();
408    identifierInfo->setIdentifier(id2, *status);
409    int32_t id2ScriptCount = identifierInfo->getScriptCount();
410    This->releaseIdentifierInfo(identifierInfo);
411    identifierInfo = NULL;
412
413    if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
414        UnicodeString   id1Skeleton;
415        UnicodeString   id2Skeleton;
416        if (id1ScriptCount <= 1 && id2ScriptCount <= 1) {
417            flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
418            uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
419            uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
420            if (id1Skeleton == id2Skeleton) {
421                result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
422            }
423        }
424    }
425
426    if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
427         // If the two inputs are single script confusable they cannot also be
428         // mixed or whole script confusable, according to the UAX39 definitions.
429         // So we can skip those tests.
430         return result;
431    }
432
433    // Two identifiers are whole script confusable if each is of a single script
434    // and they are mixed script confusable.
435    UBool possiblyWholeScriptConfusables =
436        id1ScriptCount <= 1 && id2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
437
438    //
439    // Mixed Script Check
440    //
441    if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
442        // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
443        // the mixed script table skeleton, which is what we want.
444        // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
445        UnicodeString id1Skeleton;
446        UnicodeString id2Skeleton;
447        flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
448        uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id1, id1Skeleton, status);
449        uspoof_getSkeletonUnicodeString(sc, flagsForSkeleton, id2, id2Skeleton, status);
450        if (id1Skeleton == id2Skeleton) {
451            result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
452            if (possiblyWholeScriptConfusables) {
453                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
454            }
455        }
456    }
457
458    return result;
459}
460
461
462
463
464U_CAPI int32_t U_EXPORT2
465uspoof_checkUnicodeString(const USpoofChecker *sc,
466                          const icu::UnicodeString &id,
467                          int32_t *position,
468                          UErrorCode *status) {
469    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
470    if (This == NULL) {
471        return 0;
472    }
473    int32_t result = 0;
474
475    IdentifierInfo *identifierInfo = NULL;
476    if ((This->fChecks) & (USPOOF_RESTRICTION_LEVEL | USPOOF_MIXED_NUMBERS)) {
477        identifierInfo = This->getIdentifierInfo(*status);
478        if (U_FAILURE(*status)) {
479            goto cleanupAndReturn;
480        }
481        identifierInfo->setIdentifier(id, *status);
482        identifierInfo->setIdentifierProfile(*This->fAllowedCharsSet);
483    }
484
485
486    if ((This->fChecks) & USPOOF_RESTRICTION_LEVEL) {
487        URestrictionLevel idRestrictionLevel = identifierInfo->getRestrictionLevel(*status);
488        if (idRestrictionLevel > This->fRestrictionLevel) {
489            result |= USPOOF_RESTRICTION_LEVEL;
490        }
491        if (This->fChecks & USPOOF_AUX_INFO) {
492            result |= idRestrictionLevel;
493        }
494    }
495
496    if ((This->fChecks) & USPOOF_MIXED_NUMBERS) {
497        const UnicodeSet *numerics = identifierInfo->getNumerics();
498        if (numerics->size() > 1) {
499            result |= USPOOF_MIXED_NUMBERS;
500        }
501
502        // TODO: ICU4J returns the UnicodeSet of the numerics found in the identifier.
503        //       We have no easy way to do the same in C.
504        // if (checkResult != null) {
505        //     checkResult.numerics = numerics;
506        // }
507    }
508
509
510    if (This->fChecks & (USPOOF_CHAR_LIMIT)) {
511        int32_t i;
512        UChar32 c;
513        int32_t length = id.length();
514        for (i=0; i<length ;) {
515            c = id.char32At(i);
516            i += U16_LENGTH(c);
517            if (!This->fAllowedCharsSet->contains(c)) {
518                result |= USPOOF_CHAR_LIMIT;
519                break;
520            }
521        }
522    }
523
524    if (This->fChecks &
525        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
526        // These are the checks that need to be done on NFD input
527        UnicodeString nfdText;
528        gNfdNormalizer->normalize(id, nfdText, *status);
529        int32_t nfdLength = nfdText.length();
530
531        if (This->fChecks & USPOOF_INVISIBLE) {
532
533            // scan for more than one occurence of the same non-spacing mark
534            // in a sequence of non-spacing marks.
535            int32_t     i;
536            UChar32     c;
537            UChar32     firstNonspacingMark = 0;
538            UBool       haveMultipleMarks = FALSE;
539            UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
540
541            for (i=0; i<nfdLength ;) {
542                c = nfdText.char32At(i);
543                i += U16_LENGTH(c);
544                if (u_charType(c) != U_NON_SPACING_MARK) {
545                    firstNonspacingMark = 0;
546                    if (haveMultipleMarks) {
547                        marksSeenSoFar.clear();
548                        haveMultipleMarks = FALSE;
549                    }
550                    continue;
551                }
552                if (firstNonspacingMark == 0) {
553                    firstNonspacingMark = c;
554                    continue;
555                }
556                if (!haveMultipleMarks) {
557                    marksSeenSoFar.add(firstNonspacingMark);
558                    haveMultipleMarks = TRUE;
559                }
560                if (marksSeenSoFar.contains(c)) {
561                    // report the error, and stop scanning.
562                    // No need to find more than the first failure.
563                    result |= USPOOF_INVISIBLE;
564                    break;
565                }
566                marksSeenSoFar.add(c);
567            }
568        }
569
570
571        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
572            // The basic test is the same for both whole and mixed script confusables.
573            // Compute the set of scripts that every input character has a confusable in.
574            // For this computation an input character is always considered to be
575            // confusable with itself in its own script.
576            //
577            // If the number of such scripts is two or more, and the input consisted of
578            // characters all from a single script, we have a whole script confusable.
579            // (The two scripts will be the original script and the one that is confusable)
580            //
581            // If the number of such scripts >= one, and the original input contained characters from
582            // more than one script, we have a mixed script confusable.  (We can transform
583            // some of the characters, and end up with a visually similar string all in
584            // one script.)
585
586            if (identifierInfo == NULL) {
587                identifierInfo = This->getIdentifierInfo(*status);
588                if (U_FAILURE(*status)) {
589                    goto cleanupAndReturn;
590                }
591                identifierInfo->setIdentifier(id, *status);
592            }
593
594            int32_t scriptCount = identifierInfo->getScriptCount();
595
596            ScriptSet scripts;
597            This->wholeScriptCheck(nfdText, &scripts, *status);
598            int32_t confusableScriptCount = scripts.countMembers();
599            //printf("confusableScriptCount = %d\n", confusableScriptCount);
600
601            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
602                confusableScriptCount >= 2 &&
603                scriptCount == 1) {
604                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
605            }
606
607            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
608                confusableScriptCount >= 1 &&
609                scriptCount > 1) {
610                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
611            }
612        }
613    }
614
615cleanupAndReturn:
616    This->releaseIdentifierInfo(identifierInfo);
617    if (position != NULL) {
618        *position = 0;
619    }
620    return result;
621}
622
623
624U_CAPI int32_t U_EXPORT2
625uspoof_getSkeleton(const USpoofChecker *sc,
626                   uint32_t type,
627                   const UChar *id,  int32_t length,
628                   UChar *dest, int32_t destCapacity,
629                   UErrorCode *status) {
630
631    SpoofImpl::validateThis(sc, *status);
632    if (U_FAILURE(*status)) {
633        return 0;
634    }
635    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) {
636        *status = U_ILLEGAL_ARGUMENT_ERROR;
637        return 0;
638    }
639
640    UnicodeString idStr((length==-1), id, length);  // Aliasing constructor
641    UnicodeString destStr;
642    uspoof_getSkeletonUnicodeString(sc, type, idStr, destStr, status);
643    destStr.extract(dest, destCapacity, *status);
644    return destStr.length();
645}
646
647
648
649U_I18N_API UnicodeString &  U_EXPORT2
650uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
651                                uint32_t type,
652                                const UnicodeString &id,
653                                UnicodeString &dest,
654                                UErrorCode *status) {
655    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
656    if (U_FAILURE(*status)) {
657        return dest;
658    }
659
660   int32_t tableMask = 0;
661   switch (type) {
662      case 0:
663        tableMask = USPOOF_ML_TABLE_FLAG;
664        break;
665      case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
666        tableMask = USPOOF_SL_TABLE_FLAG;
667        break;
668      case USPOOF_ANY_CASE:
669        tableMask = USPOOF_MA_TABLE_FLAG;
670        break;
671      case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
672        tableMask = USPOOF_SA_TABLE_FLAG;
673        break;
674      default:
675        *status = U_ILLEGAL_ARGUMENT_ERROR;
676        return dest;
677    }
678
679    UnicodeString nfdId;
680    gNfdNormalizer->normalize(id, nfdId, *status);
681
682    // Apply the skeleton mapping to the NFD normalized input string
683    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
684    int32_t inputIndex = 0;
685    UnicodeString skelStr;
686    int32_t normalizedLen = nfdId.length();
687    for (inputIndex=0; inputIndex < normalizedLen; ) {
688        UChar32 c = nfdId.char32At(inputIndex);
689        inputIndex += U16_LENGTH(c);
690        This->confusableLookup(c, tableMask, skelStr);
691    }
692
693    gNfdNormalizer->normalize(skelStr, dest, *status);
694    return dest;
695}
696
697
698U_CAPI int32_t U_EXPORT2
699uspoof_getSkeletonUTF8(const USpoofChecker *sc,
700                       uint32_t type,
701                       const char *id,  int32_t length,
702                       char *dest, int32_t destCapacity,
703                       UErrorCode *status) {
704    SpoofImpl::validateThis(sc, *status);
705    if (U_FAILURE(*status)) {
706        return 0;
707    }
708    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) {
709        *status = U_ILLEGAL_ARGUMENT_ERROR;
710        return 0;
711    }
712
713    UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : uprv_strlen(id)));
714    UnicodeString destStr;
715    uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status);
716    if (U_FAILURE(*status)) {
717        return 0;
718    }
719
720    int32_t lengthInUTF8 = 0;
721    u_strToUTF8(dest, destCapacity, &lengthInUTF8,
722                destStr.getBuffer(), destStr.length(), status);
723    return lengthInUTF8;
724}
725
726
727U_CAPI int32_t U_EXPORT2
728uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
729    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
730    if (This == NULL) {
731        U_ASSERT(U_FAILURE(*status));
732        return 0;
733    }
734    int32_t dataSize = This->fSpoofData->fRawData->fLength;
735    if (capacity < dataSize) {
736        *status = U_BUFFER_OVERFLOW_ERROR;
737        return dataSize;
738    }
739    uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
740    return dataSize;
741}
742
743U_CAPI const USet * U_EXPORT2
744uspoof_getInclusionSet(UErrorCode *status) {
745    umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
746    return gInclusionSet->toUSet();
747}
748
749U_CAPI const USet * U_EXPORT2
750uspoof_getRecommendedSet(UErrorCode *status) {
751    umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
752    return gRecommendedSet->toUSet();
753}
754
755U_I18N_API const UnicodeSet * U_EXPORT2
756uspoof_getInclusionUnicodeSet(UErrorCode *status) {
757    umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
758    return gInclusionSet;
759}
760
761U_I18N_API const UnicodeSet * U_EXPORT2
762uspoof_getRecommendedUnicodeSet(UErrorCode *status) {
763    umtx_initOnce(gSpoofInitOnce, &initializeStatics, *status);
764    return gRecommendedSet;
765}
766
767
768
769#endif // !UCONFIG_NO_NORMALIZATION
770