127f654740f2a26ad62a5c155af9199af9e69b889claireho/*
227f654740f2a26ad62a5c155af9199af9e69b889claireho******************************************************************************
3b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*
427f654740f2a26ad62a5c155af9199af9e69b889claireho*   Copyright (C) 2008-2009, International Business Machines
527f654740f2a26ad62a5c155af9199af9e69b889claireho*   Corporation and others.  All Rights Reserved.
627f654740f2a26ad62a5c155af9199af9e69b889claireho*
727f654740f2a26ad62a5c155af9199af9e69b889claireho******************************************************************************
827f654740f2a26ad62a5c155af9199af9e69b889claireho*   file name:  uspoof_wsconf.cpp
927f654740f2a26ad62a5c155af9199af9e69b889claireho*   encoding:   US-ASCII
1027f654740f2a26ad62a5c155af9199af9e69b889claireho*   tab size:   8 (not used)
1127f654740f2a26ad62a5c155af9199af9e69b889claireho*   indentation:4
1227f654740f2a26ad62a5c155af9199af9e69b889claireho*
1327f654740f2a26ad62a5c155af9199af9e69b889claireho*   created on: 2009Jan05  (refactoring earlier files)
1427f654740f2a26ad62a5c155af9199af9e69b889claireho*   created by: Andy Heninger
1527f654740f2a26ad62a5c155af9199af9e69b889claireho*
*   Internal functions for compiling Whole Script confusable source data
1727f654740f2a26ad62a5c155af9199af9e69b889claireho*   into its binary (runtime) form.  The binary data format is described
1827f654740f2a26ad62a5c155af9199af9e69b889claireho*   in uspoof_impl.h
1927f654740f2a26ad62a5c155af9199af9e69b889claireho*/
2027f654740f2a26ad62a5c155af9199af9e69b889claireho
2127f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/utypes.h"
2227f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/uspoof.h"
2327f654740f2a26ad62a5c155af9199af9e69b889claireho
2427f654740f2a26ad62a5c155af9199af9e69b889claireho#if !UCONFIG_NO_NORMALIZATION
2527f654740f2a26ad62a5c155af9199af9e69b889claireho
2627f654740f2a26ad62a5c155af9199af9e69b889claireho#if !UCONFIG_NO_REGULAR_EXPRESSIONS
2727f654740f2a26ad62a5c155af9199af9e69b889claireho
2827f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/unorm.h"
2927f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/uregex.h"
3027f654740f2a26ad62a5c155af9199af9e69b889claireho#include "unicode/ustring.h"
3127f654740f2a26ad62a5c155af9199af9e69b889claireho#include "cmemory.h"
3227f654740f2a26ad62a5c155af9199af9e69b889claireho#include "uspoof_impl.h"
3327f654740f2a26ad62a5c155af9199af9e69b889claireho#include "uhash.h"
3427f654740f2a26ad62a5c155af9199af9e69b889claireho#include "uvector.h"
3527f654740f2a26ad62a5c155af9199af9e69b889claireho#include "uassert.h"
3627f654740f2a26ad62a5c155af9199af9e69b889claireho#include "uspoof_wsconf.h"
3727f654740f2a26ad62a5c155af9199af9e69b889claireho
3827f654740f2a26ad62a5c155af9199af9e69b889clairehoU_NAMESPACE_USE
3927f654740f2a26ad62a5c155af9199af9e69b889claireho
4027f654740f2a26ad62a5c155af9199af9e69b889claireho
4127f654740f2a26ad62a5c155af9199af9e69b889claireho// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
4227f654740f2a26ad62a5c155af9199af9e69b889claireho// Example Lines:
4327f654740f2a26ad62a5c155af9199af9e69b889claireho//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
4427f654740f2a26ad62a5c155af9199af9e69b889claireho//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
4527f654740f2a26ad62a5c155af9199af9e69b889claireho//    |               |     |    |
4627f654740f2a26ad62a5c155af9199af9e69b889claireho//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
4727f654740f2a26ad62a5c155af9199af9e69b889claireho//    |               |     |----------Target script.   We need this.
4827f654740f2a26ad62a5c155af9199af9e69b889claireho//    |               |----------------Src script.  Should match the script of the source
4927f654740f2a26ad62a5c155af9199af9e69b889claireho//    |                                code points.  Beyond checking that, we don't keep it.
5027f654740f2a26ad62a5c155af9199af9e69b889claireho//    |--------------------------------Source code points or range.
5127f654740f2a26ad62a5c155af9199af9e69b889claireho//
5227f654740f2a26ad62a5c155af9199af9e69b889claireho// The expression will match _all_ lines, including erroneous lines.
5327f654740f2a26ad62a5c155af9199af9e69b889claireho// The result of the parse is returned via the contents of the (match) groups.
static const char *parseExp =

        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
                                                       //    The ".." range separator is escaped so that
                                                       //    only two literal dots are accepted; anything
                                                       //    else falls through to the error alternative.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //    Any line not matching the preceding
                                                       //    parts of the expression will match
                                                       //    this, and thus be flagged as an error
6927f654740f2a26ad62a5c155af9199af9e69b889claireho
7027f654740f2a26ad62a5c155af9199af9e69b889claireho
7127f654740f2a26ad62a5c155af9199af9e69b889claireho// Extract a regular expression match group into a char * string.
7227f654740f2a26ad62a5c155af9199af9e69b889claireho//    The group must contain only invariant characters.
7327f654740f2a26ad62a5c155af9199af9e69b889claireho//    Used for script names
7427f654740f2a26ad62a5c155af9199af9e69b889claireho//
7527f654740f2a26ad62a5c155af9199af9e69b889clairehostatic void extractGroup(
7627f654740f2a26ad62a5c155af9199af9e69b889claireho    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
7727f654740f2a26ad62a5c155af9199af9e69b889claireho
7827f654740f2a26ad62a5c155af9199af9e69b889claireho    UChar ubuf[50];
7927f654740f2a26ad62a5c155af9199af9e69b889claireho    ubuf[0] = 0;
8027f654740f2a26ad62a5c155af9199af9e69b889claireho    destBuf[0] = 0;
8127f654740f2a26ad62a5c155af9199af9e69b889claireho    int32_t len = uregex_group(e, group, ubuf, 50, &status);
8227f654740f2a26ad62a5c155af9199af9e69b889claireho    if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
83        return;
84    }
85    UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
86    s.extract(0, len, destBuf, destCapacity, US_INV);
87}
88
89
90
91//  Build the Whole Script Confusable data
92//
93//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
94//                         because everything is local to this one build function anyhow,
95//                           OR
96//                         break this function into more reasonably sized pieces, with
97//                         state in WSConfusableDataBuilder.
98//
99void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
100          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
101{
102    if (U_FAILURE(status)) {
103        return;
104    }
105    URegularExpression *parseRegexp = NULL;
106    int32_t             inputLen    = 0;
107    UChar              *input       = NULL;
108    int32_t             lineNum     = 0;
109
110    UVector            *scriptSets        = NULL;
111    uint32_t            rtScriptSetsCount = 2;
112
113    UTrie2             *anyCaseTrie   = NULL;
114    UTrie2             *lowerCaseTrie = NULL;
115
116    anyCaseTrie = utrie2_open(0, 0, &status);
117    lowerCaseTrie = utrie2_open(0, 0, &status);
118
119
120    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
121    //
122    // Reserved TRIE values:
123    //   0:  Code point has no whole script confusables.
124    //   1:  Code point is of script Common or Inherited.
125    //       These code points do not participate in whole script confusable detection.
126    //       (This is logically equivalent to saying that they contain confusables in
127    //        all scripts)
128    //
129    // Because Trie values are indexes into the ScriptSets vector, pre-fill
130    // vector positions 0 and 1 to avoid conflicts with the reserved values.
131
132    scriptSets = new UVector(status);
133    if (scriptSets == NULL) {
134        status = U_MEMORY_ALLOCATION_ERROR;
135        goto cleanup;
136    }
137    scriptSets->addElement((void *)NULL, status);
138    scriptSets->addElement((void *)NULL, status);
139
140    // Convert the user input data from UTF-8 to UChar (UTF-16)
141    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
142    if (status != U_BUFFER_OVERFLOW_ERROR) {
143        goto cleanup;
144    }
145    status = U_ZERO_ERROR;
146    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
147    if (input == NULL) {
148        status = U_MEMORY_ALLOCATION_ERROR;
149        goto cleanup;
150    }
151    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
152
153
154
155    parseRegexp = uregex_openC(parseExp, 0, NULL, &status);
156
157    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
158    //   given the syntax of the input.
159    if (*input == 0xfeff) {
160        *input = 0x20;
161    }
162
163    // Parse the input, one line per iteration of this loop.
164    uregex_setText(parseRegexp, input, inputLen, &status);
165    while (uregex_findNext(parseRegexp, &status)) {
166        lineNum++;
167        UChar  line[200];
168        uregex_group(parseRegexp, 0, line, 200, &status);
169        if (uregex_start(parseRegexp, 1, &status) >= 0) {
170            // this was a blank or comment line.
171            continue;
172        }
173        if (uregex_start(parseRegexp, 8, &status) >= 0) {
174            // input file syntax error.
175            status = U_PARSE_ERROR;
176            goto cleanup;
177        }
178        if (U_FAILURE(status)) {
179            goto cleanup;
180        }
181
182        // Pick up the start and optional range end code points from the parsed line.
183        UChar32  startCodePoint = SpoofImpl::ScanHex(
184            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
185        UChar32  endCodePoint = startCodePoint;
186        if (uregex_start(parseRegexp, 3, &status) >=0) {
187            endCodePoint = SpoofImpl::ScanHex(
188                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
189        }
190
191        // Extract the two script names from the source line.  We need these in an 8 bit
192        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
193        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
194        char  srcScriptName[20];
195        char  targScriptName[20];
196        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
197        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
198        UScriptCode srcScript  =
199            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
200        UScriptCode targScript =
201            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
202        if (U_FAILURE(status)) {
203            goto cleanup;
204        }
205        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
206            status = U_INVALID_FORMAT_ERROR;
207            goto cleanup;
208        }
209
210        // select the table - (A) any case or (L) lower case only
211        UTrie2 *table = anyCaseTrie;
212        if (uregex_start(parseRegexp, 7, &status) >= 0) {
213            table = lowerCaseTrie;
214        }
215
216        // Build the set of scripts containing confusable characters for
217        //   the code point(s) specified in this input line.
218        // Sanity check that the script of the source code point is the same
219        //   as the source script indicated in the input file.  Failure of this check is
220        //   an error in the input file.
221        // Include the source script in the set (needed for Mixed Script Confusable detection).
222        //
223        UChar32 cp;
224        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
225            int32_t setIndex = utrie2_get32(table, cp);
226            BuilderScriptSet *bsset = NULL;
227            if (setIndex > 0) {
228                U_ASSERT(setIndex < scriptSets->size());
229                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
230            } else {
231                bsset = new BuilderScriptSet();
232                if (bsset == NULL) {
233                    status = U_MEMORY_ALLOCATION_ERROR;
234                    goto cleanup;
235                }
236                bsset->codePoint = cp;
237                bsset->trie = table;
238                bsset->sset = new ScriptSet();
239                setIndex = scriptSets->size();
240                bsset->index = setIndex;
241                bsset->rindex = 0;
242                if (bsset->sset == NULL) {
243                    status = U_MEMORY_ALLOCATION_ERROR;
244                    goto cleanup;
245                }
246                scriptSets->addElement(bsset, status);
247                utrie2_set32(table, cp, setIndex, &status);
248            }
249            bsset->sset->Union(targScript);
250            bsset->sset->Union(srcScript);
251
252            if (U_FAILURE(status)) {
253                goto cleanup;
254            }
255            UScriptCode cpScript = uscript_getScript(cp, &status);
256            if (cpScript != srcScript) {
257                status = U_INVALID_FORMAT_ERROR;
258                goto cleanup;
259            }
260        }
261    }
262
263    // Eliminate duplicate script sets.  At this point we have a separate
264    // script set for every code point that had data in the input file.
265    //
266    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
267    //
268    // printf("Number of scriptSets: %d\n", scriptSets->size());
269    {
270        int32_t duplicateCount = 0;
271        rtScriptSetsCount = 2;
272        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
273            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
274            if (outerSet->index != static_cast<uint32_t>(outeri)) {
275                // This set was already identified as a duplicate.
276                //   It will not be allocated a position in the runtime array of ScriptSets.
277                continue;
278            }
279            outerSet->rindex = rtScriptSetsCount++;
280            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
281                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
282                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
283                    delete innerSet->sset;
284                    innerSet->scriptSetOwned = FALSE;
285                    innerSet->sset = outerSet->sset;
286                    innerSet->index = outeri;
287                    innerSet->rindex = outerSet->rindex;
288                    duplicateCount++;
289                }
290                // But this doesn't get all.  We need to fix the TRIE.
291            }
292        }
293        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
294    }
295
296
297
298    // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
299    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
300    //     are unused, which is why the loop index starts at 2.)
301    {
302        for (int32_t i=2; i<scriptSets->size(); i++) {
303            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
304            if (bSet->rindex != (uint32_t)i) {
305                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
306            }
307        }
308    }
309
310    // For code points with script==Common or script==Inherited,
311    //   Set the reserved value of 1 into both Tries.  These characters do not participate
312    //   in Whole Script Confusable detection; this reserved value is the means
313    //   by which they are detected.
314    {
315        UnicodeSet ignoreSet;
316        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
317        UnicodeSet inheritedSet;
318        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
319        ignoreSet.addAll(inheritedSet);
320        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
321            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
322            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
323            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
324            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
325        }
326    }
327
328    // Serialize the data to the Spoof Detector
329    {
330        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
331        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
332        // printf("Any case Trie size: %d\n", size);
333        if (status != U_BUFFER_OVERFLOW_ERROR) {
334            goto cleanup;
335        }
336        status = U_ZERO_ERROR;
337        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
338        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
339        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
340        void *where = spImpl->fSpoofData->reserveSpace(size, status);
341        utrie2_serialize(anyCaseTrie, where, size, &status);
342
343        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
344        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
345        // printf("Lower case Trie size: %d\n", size);
346        if (status != U_BUFFER_OVERFLOW_ERROR) {
347            goto cleanup;
348        }
349        status = U_ZERO_ERROR;
350        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
351        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
352        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
353        where = spImpl->fSpoofData->reserveSpace(size, status);
354        utrie2_serialize(lowerCaseTrie, where, size, &status);
355
356        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
357        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
358        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
359            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
360        uint32_t rindex = 2;
361        for (int32_t i=2; i<scriptSets->size(); i++) {
362            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
363            if (bSet->rindex < rindex) {
364                // We have already copied this script set to the serialized data.
365                continue;
366            }
367            U_ASSERT(rindex == bSet->rindex);
368            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
369            rindex++;
370        }
371    }
372
373    // Open new utrie2s from the serialized data.  We don't want to keep the ones
374    //   we just built because we would then have two copies of the data, one internal to
375    //   the utries that we have already constructed, and one in the serialized data area.
376    //   An alternative would be to not pre-serialize the Trie data, but that makes the
377    //   spoof detector data different, depending on how the detector was constructed.
378    //   It's simpler to keep the data always the same.
379
380    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
381            UTRIE2_16_VALUE_BITS,
382            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
383            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
384            NULL,
385            &status);
386
387    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
388            UTRIE2_16_VALUE_BITS,
389            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
390            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
391            NULL,
392            &status);
393
394
395
396cleanup:
397    if (U_FAILURE(status)) {
398        pe->line = lineNum;
399    }
400    uregex_close(parseRegexp);
401    uprv_free(input);
402
403    int32_t i;
404    for (i=0; i<scriptSets->size(); i++) {
405        BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
406        delete bsset;
407    }
408    delete scriptSets;
409    utrie2_close(anyCaseTrie);
410    utrie2_close(lowerCaseTrie);
411    return;
412}
413
414
415
416
417
418BuilderScriptSet::BuilderScriptSet() {
419    codePoint = -1;
420    trie = NULL;
421    sset = NULL;
422    index = 0;
423    rindex = 0;
424    scriptSetOwned = TRUE;
425}
426
427BuilderScriptSet::~BuilderScriptSet() {
428    if (scriptSetOwned) {
429        delete sset;
430    }
431}
432
433#endif
434#endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
435
436