1/*
2***************************************************************************
3* Copyright (C) 2008-2009, International Business Machines Corporation
4* and others. All Rights Reserved.
5***************************************************************************
6*   file name:  uspoof.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2008Feb13
12*   created by: Andy Heninger
13*
14*   Unicode Spoof Detection
15*/
16#include "unicode/utypes.h"
17#include "unicode/uspoof.h"
18#include "unicode/unorm.h"
19#include "unicode/ustring.h"
20#include "cmemory.h"
21#include "uspoof_impl.h"
22#include "uassert.h"
23
24
25#if !UCONFIG_NO_NORMALIZATION
26
27
28#include <stdio.h>      // debug
29
30U_NAMESPACE_USE
31
32
33U_CAPI USpoofChecker * U_EXPORT2
34uspoof_open(UErrorCode *status) {
35    if (U_FAILURE(*status)) {
36        return NULL;
37    }
38    SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
39    if (U_FAILURE(*status)) {
40        delete si;
41        si = NULL;
42    }
43    return (USpoofChecker *)si;
44}
45
46
47U_CAPI USpoofChecker * U_EXPORT2
48uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
49                          UErrorCode *status) {
50    if (U_FAILURE(*status)) {
51        return NULL;
52    }
53    SpoofData *sd = new SpoofData(data, length, *status);
54    SpoofImpl *si = new SpoofImpl(sd, *status);
55    if (U_FAILURE(*status)) {
56        delete sd;
57        delete si;
58        return NULL;
59    }
60    if (sd == NULL || si == NULL) {
61        *status = U_MEMORY_ALLOCATION_ERROR;
62        delete sd;
63        delete si;
64        return NULL;
65    }
66
67    if (pActualLength != NULL) {
68        *pActualLength = sd->fRawData->fLength;
69    }
70    return reinterpret_cast<USpoofChecker *>(si);
71}
72
73
74U_CAPI USpoofChecker * U_EXPORT2
75uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
76    const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
77    if (src == NULL) {
78        return NULL;
79    }
80    SpoofImpl *result = new SpoofImpl(*src, *status);   // copy constructor
81    if (U_FAILURE(*status)) {
82        delete result;
83        result = NULL;
84    }
85    return (USpoofChecker *)result;
86}
87
88
89U_CAPI void U_EXPORT2
90uspoof_close(USpoofChecker *sc) {
91    UErrorCode status = U_ZERO_ERROR;
92    SpoofImpl *This = SpoofImpl::validateThis(sc, status);
93    delete This;
94}
95
96
97U_CAPI void U_EXPORT2
98uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
99    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
100    if (This == NULL) {
101        return;
102    }
103
104    // Verify that the requested checks are all ones (bits) that
105    //   are acceptable, known values.
106    if (checks & ~USPOOF_ALL_CHECKS) {
107        *status = U_ILLEGAL_ARGUMENT_ERROR;
108        return;
109    }
110
111    This->fChecks = checks;
112}
113
114
115U_CAPI int32_t U_EXPORT2
116uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
117    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
118    if (This == NULL) {
119        return 0;
120    }
121    return This->fChecks;
122}
123
124U_CAPI void U_EXPORT2
125uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
126    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
127    if (This == NULL) {
128        return;
129    }
130    This->setAllowedLocales(localesList, *status);
131}
132
133U_CAPI const char * U_EXPORT2
134uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) {
135    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
136    if (This == NULL) {
137        return NULL;
138    }
139    return This->getAllowedLocales(*status);
140}
141
142
143U_CAPI const USet * U_EXPORT2
144uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
145    const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
146    return reinterpret_cast<const USet *>(result);
147}
148
149U_CAPI const UnicodeSet * U_EXPORT2
150uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
151    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
152    if (This == NULL) {
153        return NULL;
154    }
155    return This->fAllowedCharsSet;
156}
157
158
159U_CAPI void U_EXPORT2
160uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
161    const UnicodeSet *set = reinterpret_cast<const UnicodeSet *>(chars);
162    uspoof_setAllowedUnicodeSet(sc, set, status);
163}
164
165
166U_CAPI void U_EXPORT2
167uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
168    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
169    if (This == NULL) {
170        return;
171    }
172    if (chars->isBogus()) {
173        *status = U_ILLEGAL_ARGUMENT_ERROR;
174        return;
175    }
176    UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
177    if (clonedSet == NULL || clonedSet->isBogus()) {
178        *status = U_MEMORY_ALLOCATION_ERROR;
179        return;
180    }
181    clonedSet->freeze();
182    delete This->fAllowedCharsSet;
183    This->fAllowedCharsSet = clonedSet;
184    This->fChecks |= USPOOF_CHAR_LIMIT;
185}
186
187
188U_CAPI int32_t U_EXPORT2
189uspoof_check(const USpoofChecker *sc,
190             const UChar *text, int32_t length,
191             int32_t *position,
192             UErrorCode *status) {
193
194    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
195    if (This == NULL) {
196        return 0;
197    }
198    if (length < -1) {
199        *status = U_ILLEGAL_ARGUMENT_ERROR;
200        return 0;
201    }
202    if (length == -1) {
203        // It's not worth the bother to handle nul terminated strings everywhere.
204        //   Just get the length and be done with it.
205        length = u_strlen(text);
206    }
207
208    int32_t result = 0;
209    int32_t failPos = 0x7fffffff;   // TODO: do we have a #define for max int32?
210
211    // A count of the number of non-Common or inherited scripts.
212    // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
213    // Share the computation when possible.  scriptCount == -1 means that we haven't
214    // done it yet.
215    int32_t scriptCount = -1;
216
217    if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
218        scriptCount = This->scriptScan(text, length, failPos, *status);
219        // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
220        if ( scriptCount >= 2) {
221            // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
222            result |= USPOOF_SINGLE_SCRIPT;
223        }
224    }
225
226    if (This->fChecks & USPOOF_CHAR_LIMIT) {
227        int32_t i;
228        UChar32 c;
229        for (i=0; i<length ;) {
230            U16_NEXT(text, i, length, c);
231            if (!This->fAllowedCharsSet->contains(c)) {
232                result |= USPOOF_CHAR_LIMIT;
233                if (i < failPos) {
234                    failPos = i;
235                }
236                break;
237            }
238        }
239    }
240
241    if (This->fChecks &
242        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
243        // These are the checks that need to be done on NFKD input
244        NFKDBuffer   normalizedInput(text, length, *status);
245        const UChar  *nfkdText = normalizedInput.getBuffer();
246        int32_t      nfkdLength = normalizedInput.getLength();
247
248        if (This->fChecks & USPOOF_INVISIBLE) {
249
250            // scan for more than one occurence of the same non-spacing mark
251            // in a sequence of non-spacing marks.
252            int32_t     i;
253            UChar32     c;
254            UChar32     firstNonspacingMark = 0;
255            UBool       haveMultipleMarks = FALSE;
256            UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
257
258            for (i=0; i<length ;) {
259                U16_NEXT(nfkdText, i, nfkdLength, c);
260                if (u_charType(c) != U_NON_SPACING_MARK) {
261                    firstNonspacingMark = 0;
262                    if (haveMultipleMarks) {
263                        marksSeenSoFar.clear();
264                        haveMultipleMarks = FALSE;
265                    }
266                    continue;
267                }
268                if (firstNonspacingMark == 0) {
269                    firstNonspacingMark = c;
270                    continue;
271                }
272                if (!haveMultipleMarks) {
273                    marksSeenSoFar.add(firstNonspacingMark);
274                    haveMultipleMarks = TRUE;
275                }
276                if (marksSeenSoFar.contains(c)) {
277                    // report the error, and stop scanning.
278                    // No need to find more than the first failure.
279                    result |= USPOOF_INVISIBLE;
280                    failPos = i;
281                    break;
282                }
283                marksSeenSoFar.add(c);
284            }
285        }
286
287
288        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
289            // The basic test is the same for both whole and mixed script confusables.
290            // Compute the set of scripts that every input character has a confusable in.
291            // For this computation an input character is always considered to be
292            //    confusable with itself in its own script.
293            // If the number of such scripts is two or more, and the input consisted of
294            //   characters all from a single script, we have a whole script confusable.
295            //   (The two scripts will be the original script and the one that is confusable)
296            // If the number of such scripts >= one, and the original input contained characters from
297            //   more than one script, we have a mixed script confusable.  (We can transform
298            //   some of the characters, and end up with a visually similar string all in
299            //   one script.)
300
301            if (scriptCount == -1) {
302                int32_t t;
303                scriptCount = This->scriptScan(text, length, t, *status);
304            }
305
306            ScriptSet scripts;
307            This->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status);
308            int32_t confusableScriptCount = scripts.countMembers();
309            //printf("confusableScriptCount = %d\n", confusableScriptCount);
310
311            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
312                confusableScriptCount >= 2 &&
313                scriptCount == 1) {
314                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
315            }
316
317            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
318                confusableScriptCount >= 1 &&
319                scriptCount > 1) {
320                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
321            }
322        }
323    }
324    if (position != NULL && failPos != 0x7fffffff) {
325        *position = failPos;
326    }
327    return result;
328}
329
330
331U_CAPI int32_t U_EXPORT2
332uspoof_checkUTF8(const USpoofChecker *sc,
333                 const char *text, int32_t length,
334                 int32_t *position,
335                 UErrorCode *status) {
336
337    if (U_FAILURE(*status)) {
338        return 0;
339    }
340    UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];
341    UChar* text16 = stackBuf;
342    int32_t len16;
343
344    u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status);
345    if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
346        return 0;
347    }
348    if (*status == U_BUFFER_OVERFLOW_ERROR) {
349        text16 = static_cast<UChar *>(uprv_malloc(len16 * sizeof(UChar) + 2));
350        if (text16 == NULL) {
351            *status = U_MEMORY_ALLOCATION_ERROR;
352            return 0;
353        }
354        *status = U_ZERO_ERROR;
355        u_strFromUTF8(text16, len16+1, NULL, text, length, status);
356    }
357
358    int32_t position16 = -1;
359    int32_t result = uspoof_check(sc, text16, len16, &position16, status);
360    if (U_FAILURE(*status)) {
361        return 0;
362    }
363
364    if (position16 > 0) {
365        // Translate a UTF-16 based error position back to a UTF-8 offset.
366        // u_strToUTF8() in preflight mode is an easy way to do it.
367        U_ASSERT(position16 <= len16);
368        u_strToUTF8(NULL, 0, position, text16, position16, status);
369        if (position > 0) {
370            // position is the required buffer length from u_strToUTF8, which includes
371            // space for a terminating NULL, which we don't want, hence the -1.
372            *position -= 1;
373        }
374        *status = U_ZERO_ERROR;   // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR.
375    }
376
377    if (text16 != stackBuf) {
378        uprv_free(text16);
379    }
380    return result;
381
382}
383
384/*  A convenience wrapper around the public uspoof_getSkeleton that handles
385 *  allocating a larger buffer than provided if the original is too small.
386 */
387static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength,
388                         UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) {
389    int32_t requiredCapacity = 0;
390    UChar *buf = dest;
391
392    if (U_FAILURE(*status)) {
393        return NULL;
394    }
395    requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status);
396    if (*status == U_BUFFER_OVERFLOW_ERROR) {
397        buf = static_cast<UChar *>(uprv_malloc(requiredCapacity * sizeof(UChar)));
398        if (buf == NULL) {
399            *status = U_MEMORY_ALLOCATION_ERROR;
400            return NULL;
401        }
402        *status = U_ZERO_ERROR;
403        uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status);
404    }
405    *outputLength = requiredCapacity;
406    return buf;
407}
408
409
410U_CAPI int32_t U_EXPORT2
411uspoof_areConfusable(const USpoofChecker *sc,
412                     const UChar *s1, int32_t length1,
413                     const UChar *s2, int32_t length2,
414                     UErrorCode *status) {
415    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
416    if (U_FAILURE(*status)) {
417        return 0;
418    }
419    //
420    // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
421    //   and for definitions of the types (single, whole, mixed-script) of confusables.
422
423    // We only care about a few of the check flags.  Ignore the others.
424    // If no tests relavant to this function have been specified, return an error.
425    // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
426    //        but logically we would just return 0 (no error).
427    if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
428                          USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
429        *status = U_INVALID_STATE_ERROR;
430        return 0;
431    }
432    int32_t  flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
433    UChar    s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
434    UChar   *s1Skeleton;
435    int32_t  s1SkeletonLength = 0;
436
437    UChar    s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
438    UChar   *s2Skeleton;
439    int32_t  s2SkeletonLength = 0;
440
441    int32_t  result = 0;
442    int32_t  t;
443    int32_t  s1ScriptCount = This->scriptScan(s1, length1, t, *status);
444    int32_t  s2ScriptCount = This->scriptScan(s2, length2, t, *status);
445
446    if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
447        // Do the Single Script compare.
448        if (s1ScriptCount <= 1 && s2ScriptCount <= 1) {
449            flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
450            s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
451                                     sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
452            s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
453                                     sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
454            if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
455                result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
456            }
457            if (s1Skeleton != s1SkeletonBuf) {
458                uprv_free(s1Skeleton);
459            }
460            if (s2Skeleton != s2SkeletonBuf) {
461                uprv_free(s2Skeleton);
462            }
463        }
464    }
465
466    if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
467         // If the two inputs are single script confusable they cannot also be
468         // mixed or whole script confusable, according to the UAX39 definitions.
469         // So we can skip those tests.
470         return result;
471    }
472
473    // Optimization for whole script confusables test:  two identifiers are whole script confusable if
474    // each is of a single script and they are mixed script confusable.
475    UBool possiblyWholeScriptConfusables =
476        s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
477
478    //
479    // Mixed Script Check
480    //
481    if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
482        // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
483        // the mixed script table skeleton, which is what we want.
484        // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
485        flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
486        s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
487                                 sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
488        s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
489                                 sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
490        if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
491            result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
492            if (possiblyWholeScriptConfusables) {
493                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
494            }
495        }
496        if (s1Skeleton != s1SkeletonBuf) {
497            uprv_free(s1Skeleton);
498        }
499        if (s2Skeleton != s2SkeletonBuf) {
500            uprv_free(s2Skeleton);
501        }
502    }
503
504    return result;
505}
506
507
508// Convenience function for converting a UTF-8 input to a UChar * string, including
509//          reallocating a buffer when required.  Parameters and their interpretation mostly
510//          match u_strFromUTF8.
511
512static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength,
513                               const char *in, int32_t inLength, UErrorCode *status) {
514    if (U_FAILURE(*status)) {
515        return NULL;
516    }
517    UChar *dest = outBuf;
518    u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status);
519    if (*status == U_BUFFER_OVERFLOW_ERROR) {
520        dest = static_cast<UChar *>(uprv_malloc(*outputLength * sizeof(UChar)));
521        if (dest == NULL) {
522            *status = U_MEMORY_ALLOCATION_ERROR;
523            return NULL;
524        }
525        *status = U_ZERO_ERROR;
526        u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status);
527    }
528    return dest;
529}
530
531
532
533U_CAPI int32_t U_EXPORT2
534uspoof_areConfusableUTF8(const USpoofChecker *sc,
535                         const char *s1, int32_t length1,
536                         const char *s2, int32_t length2,
537                         UErrorCode *status) {
538
539    SpoofImpl::validateThis(sc, *status);
540    if (U_FAILURE(*status)) {
541        return 0;
542    }
543
544    UChar    s1Buf[USPOOF_STACK_BUFFER_SIZE];
545    int32_t  lengthS1U;
546    UChar   *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status);
547
548    UChar    s2Buf[USPOOF_STACK_BUFFER_SIZE];
549    int32_t  lengthS2U;
550    UChar   *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status);
551
552    int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status);
553
554    if (s1U != s1Buf) {
555        uprv_free(s1U);
556    }
557    if (s2U != s2Buf) {
558        uprv_free(s2U);
559    }
560    return results;
561}
562
563
564U_CAPI int32_t U_EXPORT2
565uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
566                                  const U_NAMESPACE_QUALIFIER UnicodeString &s1,
567                                  const U_NAMESPACE_QUALIFIER UnicodeString &s2,
568                                  UErrorCode *status) {
569
570    const UChar *u1  = s1.getBuffer();
571    int32_t  length1 = s1.length();
572    const UChar *u2  = s2.getBuffer();
573    int32_t  length2 = s2.length();
574
575    int32_t results  = uspoof_areConfusable(sc, u1, length1, u2, length2, status);
576    return results;
577}
578
579
580
581
582U_CAPI int32_t U_EXPORT2
583uspoof_checkUnicodeString(const USpoofChecker *sc,
584                          const U_NAMESPACE_QUALIFIER UnicodeString &text,
585                          int32_t *position,
586                          UErrorCode *status) {
587    int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status);
588    return result;
589}
590
591
592U_CAPI int32_t U_EXPORT2
593uspoof_getSkeleton(const USpoofChecker *sc,
594                   uint32_t type,
595                   const UChar *s,  int32_t length,
596                   UChar *dest, int32_t destCapacity,
597                   UErrorCode *status) {
598
599    // TODO:  this function could be sped up a bit
600    //        Skip the input normalization when not needed, work from callers data.
601    //        Put the initial skeleton straight into the caller's destination buffer.
602    //        It probably won't need normalization.
603    //        But these would make the structure more complicated.
604
605    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
606    if (U_FAILURE(*status)) {
607        return 0;
608    }
609    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
610        (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
611        *status = U_ILLEGAL_ARGUMENT_ERROR;
612        return 0;
613    }
614
615   int32_t tableMask = 0;
616   switch (type) {
617      case 0:
618        tableMask = USPOOF_ML_TABLE_FLAG;
619        break;
620      case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
621        tableMask = USPOOF_SL_TABLE_FLAG;
622        break;
623      case USPOOF_ANY_CASE:
624        tableMask = USPOOF_MA_TABLE_FLAG;
625        break;
626      case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
627        tableMask = USPOOF_SA_TABLE_FLAG;
628        break;
629      default:
630        *status = U_ILLEGAL_ARGUMENT_ERROR;
631        return 0;
632    }
633
634    // NFKD transform of the user supplied input
635
636    UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE];
637    UChar *nfkdInput = nfkdStackBuf;
638    int32_t normalizedLen = unorm_normalize(
639        s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status);
640    if (*status == U_BUFFER_OVERFLOW_ERROR) {
641        nfkdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
642        if (nfkdInput == NULL) {
643            *status = U_MEMORY_ALLOCATION_ERROR;
644            return 0;
645        }
646        *status = U_ZERO_ERROR;
647        normalizedLen = unorm_normalize(s, length, UNORM_NFKD, 0,
648                                        nfkdInput, normalizedLen+1, status);
649    }
650    if (U_FAILURE(*status)) {
651        if (nfkdInput != nfkdStackBuf) {
652            uprv_free(nfkdInput);
653        }
654        return 0;
655    }
656
657    // buffer to hold the Unicode defined skeleton mappings for a single code point
658    UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
659
660    // Apply the skeleton mapping to the NFKD normalized input string
661    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
662    int32_t inputIndex = 0;
663    UnicodeString skelStr;
664    while (inputIndex < normalizedLen) {
665        UChar32 c;
666        U16_NEXT(nfkdInput, inputIndex, normalizedLen, c);
667        int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
668        skelStr.append(buf, replaceLen);
669    }
670
671    if (nfkdInput != nfkdStackBuf) {
672        uprv_free(nfkdInput);
673    }
674
675    const UChar *result = skelStr.getBuffer();
676    int32_t  resultLen  = skelStr.length();
677    UChar   *normedResult = NULL;
678
679    // Check the skeleton for NFKD, normalize it if needed.
680    // Unnormalized results should be very rare.
681    if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) {
682        normalizedLen = unorm_normalize(dest, resultLen, UNORM_NFKD, 0, NULL, 0, status);
683        UChar *normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
684        if (normedResult == NULL) {
685            *status = U_MEMORY_ALLOCATION_ERROR;
686            return 0;
687        }
688        unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status);
689        result = normedResult;
690        resultLen = normalizedLen;
691    }
692
693    // Copy the skeleton to the caller's buffer
694    if (U_SUCCESS(*status)) {
695        if (destCapacity == 0 || resultLen > destCapacity) {
696            *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
697        } else {
698            u_memcpy(dest, result, resultLen);
699            if (destCapacity > resultLen) {
700                dest[resultLen] = 0;
701            } else {
702                *status = U_STRING_NOT_TERMINATED_WARNING;
703            }
704        }
705     }
706     uprv_free(normedResult);
707     return resultLen;
708}
709
710
711
712U_CAPI UnicodeString &  U_EXPORT2
713uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
714                                uint32_t type,
715                                const UnicodeString &s,
716                                UnicodeString &dest,
717                                UErrorCode *status) {
718    if (U_FAILURE(*status)) {
719        return dest;
720    }
721    dest.remove();
722
723    const UChar *str = s.getBuffer();
724    int32_t      strLen = s.length();
725    UChar        smallBuf[USPOOF_STACK_BUFFER_SIZE];
726    UChar       *buf = smallBuf;
727    int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status);
728    if (*status == U_BUFFER_OVERFLOW_ERROR) {
729        buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar)));
730        if (buf == NULL) {
731            *status = U_MEMORY_ALLOCATION_ERROR;
732            return dest;
733        }
734        *status = U_ZERO_ERROR;
735        uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);
736    }
737    if (U_SUCCESS(*status)) {
738        dest.setTo(buf, outputSize);
739    }
740
741    if (buf != smallBuf) {
742        uprv_free(buf);
743    }
744    return dest;
745}
746
747
748U_CAPI int32_t U_EXPORT2
749uspoof_getSkeletonUTF8(const USpoofChecker *sc,
750                       uint32_t type,
751                       const char *s,  int32_t length,
752                       char *dest, int32_t destCapacity,
753                       UErrorCode *status) {
754    // Lacking a UTF-8 normalization API, just converting the input to
755    // UTF-16 seems as good an approach as any.  In typical use, input will
756    // be an identifier, which is to say not too long for stack buffers.
757    if (U_FAILURE(*status)) {
758        return 0;
759    }
760    // Buffers for the UChar form of the input and skeleton strings.
761    UChar    smallInBuf[USPOOF_STACK_BUFFER_SIZE];
762    UChar   *inBuf = smallInBuf;
763    UChar    smallOutBuf[USPOOF_STACK_BUFFER_SIZE];
764    UChar   *outBuf = smallOutBuf;
765
766    int32_t  lengthInUChars = 0;
767    int32_t  skelLengthInUChars = 0;
768    int32_t  skelLengthInUTF8 = 0;
769
770    u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,
771                  s, length, status);
772    if (*status == U_BUFFER_OVERFLOW_ERROR) {
773        inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar)));
774        if (inBuf == NULL) {
775            *status = U_MEMORY_ALLOCATION_ERROR;
776            goto cleanup;
777        }
778        *status = U_ZERO_ERROR;
779        u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars+1,
780                      s, length, status);
781    }
782
783    skelLengthInUChars = uspoof_getSkeleton(sc, type, outBuf, lengthInUChars,
784                                         outBuf, USPOOF_STACK_BUFFER_SIZE, status);
785    if (*status == U_BUFFER_OVERFLOW_ERROR) {
786        *status = U_ZERO_ERROR;
787        outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
788        if (outBuf == NULL) {
789            *status = U_MEMORY_ALLOCATION_ERROR;
790            goto cleanup;
791        }
792        skelLengthInUChars = uspoof_getSkeleton(sc, type, outBuf, lengthInUChars,
793                                         outBuf, USPOOF_STACK_BUFFER_SIZE, status);
794    }
795
796    u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
797                outBuf, skelLengthInUChars, status);
798
799  cleanup:
800    if (inBuf != smallInBuf) {
801        uprv_free(inBuf);
802    }
803    if (outBuf != smallOutBuf) {
804        uprv_free(outBuf);
805    }
806    return skelLengthInUTF8;
807}
808
809
810U_CAPI int32_t U_EXPORT2
811uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
812    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
813    if (This == NULL) {
814        U_ASSERT(U_FAILURE(*status));
815        return 0;
816    }
817    int32_t dataSize = This->fSpoofData->fRawData->fLength;
818    if (capacity < dataSize) {
819        *status = U_BUFFER_OVERFLOW_ERROR;
820        return dataSize;
821    }
822    uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
823    return dataSize;
824}
825
826#endif
827