1/*
2******************************************************************************
3*
4*   Copyright (C) 2008-2013, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  uspoof_wsconf.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2009Jan05  (refactoring earlier files)
14*   created by: Andy Heninger
15*
*   Internal functions for compiling Whole Script confusable source data
17*   into its binary (runtime) form.  The binary data format is described
18*   in uspoof_impl.h
19*/
20
21#include "unicode/utypes.h"
22#include "unicode/uspoof.h"
23
24#if !UCONFIG_NO_NORMALIZATION
25
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
28#include "unicode/unorm.h"
29#include "unicode/uregex.h"
30#include "unicode/ustring.h"
31#include "cmemory.h"
32#include "scriptset.h"
33#include "uspoof_impl.h"
34#include "uhash.h"
35#include "uvector.h"
36#include "uassert.h"
37#include "uspoof_wsconf.h"
38
39U_NAMESPACE_USE
40
41
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |    |
//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |----------Target script.   We need this.
//    |               |----------------Src script.  Should match the script of the source
//    |                                code points.  Beyond checking that, we don't keep it.
//    |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups.
//
// The ".." range separator is matched literally (escaped dots).  A line using any other
// two characters between the code points falls through to the error alternative (Group 8).
static const char *parseExp =
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //    Any line not matching the preceding
                                                       //    parts of the expression will match
                                                       //    this, and thus be flagged as an error
69
70
71// Extract a regular expression match group into a char * string.
72//    The group must contain only invariant characters.
73//    Used for script names
74//
75static void extractGroup(
76    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
77
78    UChar ubuf[50];
79    ubuf[0] = 0;
80    destBuf[0] = 0;
81    int32_t len = uregex_group(e, group, ubuf, 50, &status);
82    if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
83        return;
84    }
85    UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
86    s.extract(0, len, destBuf, destCapacity, US_INV);
87}
88
89
90
91U_NAMESPACE_BEGIN
92
93//  Build the Whole Script Confusable data
94//
95//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
96//                         because everything is local to this one build function anyhow,
97//                           OR
98//                         break this function into more reasonably sized pieces, with
99//                         state in WSConfusableDataBuilder.
100//
101void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
102          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
103{
104    if (U_FAILURE(status)) {
105        return;
106    }
107    URegularExpression *parseRegexp = NULL;
108    int32_t             inputLen    = 0;
109    UChar              *input       = NULL;
110    int32_t             lineNum     = 0;
111
112    UVector            *scriptSets        = NULL;
113    uint32_t            rtScriptSetsCount = 2;
114
115    UTrie2             *anyCaseTrie   = NULL;
116    UTrie2             *lowerCaseTrie = NULL;
117
118    anyCaseTrie = utrie2_open(0, 0, &status);
119    lowerCaseTrie = utrie2_open(0, 0, &status);
120
121    UnicodeString pattern(parseExp, -1, US_INV);
122
123    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
124    //
125    // Reserved TRIE values:
126    //   0:  Code point has no whole script confusables.
127    //   1:  Code point is of script Common or Inherited.
128    //       These code points do not participate in whole script confusable detection.
129    //       (This is logically equivalent to saying that they contain confusables in
130    //        all scripts)
131    //
132    // Because Trie values are indexes into the ScriptSets vector, pre-fill
133    // vector positions 0 and 1 to avoid conflicts with the reserved values.
134
135    scriptSets = new UVector(status);
136    if (scriptSets == NULL) {
137        status = U_MEMORY_ALLOCATION_ERROR;
138        goto cleanup;
139    }
140    scriptSets->addElement((void *)NULL, status);
141    scriptSets->addElement((void *)NULL, status);
142
143    // Convert the user input data from UTF-8 to UChar (UTF-16)
144    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
145    if (status != U_BUFFER_OVERFLOW_ERROR) {
146        goto cleanup;
147    }
148    status = U_ZERO_ERROR;
149    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
150    if (input == NULL) {
151        status = U_MEMORY_ALLOCATION_ERROR;
152        goto cleanup;
153    }
154    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
155
156    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
157
158    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
159    //   given the syntax of the input.
160    if (*input == 0xfeff) {
161        *input = 0x20;
162    }
163
164    // Parse the input, one line per iteration of this loop.
165    uregex_setText(parseRegexp, input, inputLen, &status);
166    while (uregex_findNext(parseRegexp, &status)) {
167        lineNum++;
168        if (uregex_start(parseRegexp, 1, &status) >= 0) {
169            // this was a blank or comment line.
170            continue;
171        }
172        if (uregex_start(parseRegexp, 8, &status) >= 0) {
173            // input file syntax error.
174            status = U_PARSE_ERROR;
175            goto cleanup;
176        }
177        if (U_FAILURE(status)) {
178            goto cleanup;
179        }
180
181        // Pick up the start and optional range end code points from the parsed line.
182        UChar32  startCodePoint = SpoofImpl::ScanHex(
183            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
184        UChar32  endCodePoint = startCodePoint;
185        if (uregex_start(parseRegexp, 3, &status) >=0) {
186            endCodePoint = SpoofImpl::ScanHex(
187                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
188        }
189
190        // Extract the two script names from the source line.  We need these in an 8 bit
191        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
192        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
193        char  srcScriptName[20];
194        char  targScriptName[20];
195        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
196        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
197        UScriptCode srcScript  =
198            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
199        UScriptCode targScript =
200            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
201        if (U_FAILURE(status)) {
202            goto cleanup;
203        }
204        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
205            status = U_INVALID_FORMAT_ERROR;
206            goto cleanup;
207        }
208
209        // select the table - (A) any case or (L) lower case only
210        UTrie2 *table = anyCaseTrie;
211        if (uregex_start(parseRegexp, 7, &status) >= 0) {
212            table = lowerCaseTrie;
213        }
214
215        // Build the set of scripts containing confusable characters for
216        //   the code point(s) specified in this input line.
217        // Sanity check that the script of the source code point is the same
218        //   as the source script indicated in the input file.  Failure of this check is
219        //   an error in the input file.
220        // Include the source script in the set (needed for Mixed Script Confusable detection).
221        //
222        UChar32 cp;
223        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
224            int32_t setIndex = utrie2_get32(table, cp);
225            BuilderScriptSet *bsset = NULL;
226            if (setIndex > 0) {
227                U_ASSERT(setIndex < scriptSets->size());
228                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
229            } else {
230                bsset = new BuilderScriptSet();
231                if (bsset == NULL) {
232                    status = U_MEMORY_ALLOCATION_ERROR;
233                    goto cleanup;
234                }
235                bsset->codePoint = cp;
236                bsset->trie = table;
237                bsset->sset = new ScriptSet();
238                setIndex = scriptSets->size();
239                bsset->index = setIndex;
240                bsset->rindex = 0;
241                if (bsset->sset == NULL) {
242                    status = U_MEMORY_ALLOCATION_ERROR;
243                    goto cleanup;
244                }
245                scriptSets->addElement(bsset, status);
246                utrie2_set32(table, cp, setIndex, &status);
247            }
248            bsset->sset->set(targScript, status);
249            bsset->sset->set(srcScript, status);
250
251            if (U_FAILURE(status)) {
252                goto cleanup;
253            }
254            UScriptCode cpScript = uscript_getScript(cp, &status);
255            if (cpScript != srcScript) {
256                status = U_INVALID_FORMAT_ERROR;
257                goto cleanup;
258            }
259        }
260    }
261
262    // Eliminate duplicate script sets.  At this point we have a separate
263    // script set for every code point that had data in the input file.
264    //
265    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
266    //
267    // printf("Number of scriptSets: %d\n", scriptSets->size());
268    {
269        int32_t duplicateCount = 0;
270        rtScriptSetsCount = 2;
271        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
272            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
273            if (outerSet->index != static_cast<uint32_t>(outeri)) {
274                // This set was already identified as a duplicate.
275                //   It will not be allocated a position in the runtime array of ScriptSets.
276                continue;
277            }
278            outerSet->rindex = rtScriptSetsCount++;
279            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
280                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
281                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
282                    delete innerSet->sset;
283                    innerSet->scriptSetOwned = FALSE;
284                    innerSet->sset = outerSet->sset;
285                    innerSet->index = outeri;
286                    innerSet->rindex = outerSet->rindex;
287                    duplicateCount++;
288                }
289                // But this doesn't get all.  We need to fix the TRIE.
290            }
291        }
292        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
293    }
294
295
296
297    // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
298    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
299    //     are unused, which is why the loop index starts at 2.)
300    {
301        for (int32_t i=2; i<scriptSets->size(); i++) {
302            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
303            if (bSet->rindex != (uint32_t)i) {
304                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
305            }
306        }
307    }
308
309    // For code points with script==Common or script==Inherited,
310    //   Set the reserved value of 1 into both Tries.  These characters do not participate
311    //   in Whole Script Confusable detection; this reserved value is the means
312    //   by which they are detected.
313    {
314        UnicodeSet ignoreSet;
315        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
316        UnicodeSet inheritedSet;
317        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
318        ignoreSet.addAll(inheritedSet);
319        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
320            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
321            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
322            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
323            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
324        }
325    }
326
327    // Serialize the data to the Spoof Detector
328    {
329        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
330        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
331        // printf("Any case Trie size: %d\n", size);
332        if (status != U_BUFFER_OVERFLOW_ERROR) {
333            goto cleanup;
334        }
335        status = U_ZERO_ERROR;
336        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
337        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
338        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
339        void *where = spImpl->fSpoofData->reserveSpace(size, status);
340        utrie2_serialize(anyCaseTrie, where, size, &status);
341
342        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
343        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
344        // printf("Lower case Trie size: %d\n", size);
345        if (status != U_BUFFER_OVERFLOW_ERROR) {
346            goto cleanup;
347        }
348        status = U_ZERO_ERROR;
349        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
350        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
351        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
352        where = spImpl->fSpoofData->reserveSpace(size, status);
353        utrie2_serialize(lowerCaseTrie, where, size, &status);
354
355        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
356        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
357        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
358            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
359        uint32_t rindex = 2;
360        for (int32_t i=2; i<scriptSets->size(); i++) {
361            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
362            if (bSet->rindex < rindex) {
363                // We have already copied this script set to the serialized data.
364                continue;
365            }
366            U_ASSERT(rindex == bSet->rindex);
367            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
368            rindex++;
369        }
370    }
371
372    // Open new utrie2s from the serialized data.  We don't want to keep the ones
373    //   we just built because we would then have two copies of the data, one internal to
374    //   the utries that we have already constructed, and one in the serialized data area.
375    //   An alternative would be to not pre-serialize the Trie data, but that makes the
376    //   spoof detector data different, depending on how the detector was constructed.
377    //   It's simpler to keep the data always the same.
378
379    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
380            UTRIE2_16_VALUE_BITS,
381            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
382            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
383            NULL,
384            &status);
385
386    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
387            UTRIE2_16_VALUE_BITS,
388            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
389            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
390            NULL,
391            &status);
392
393
394
395cleanup:
396    if (U_FAILURE(status)) {
397        pe->line = lineNum;
398    }
399    uregex_close(parseRegexp);
400    uprv_free(input);
401
402    int32_t i;
403    if (scriptSets != NULL) {
404        for (i=0; i<scriptSets->size(); i++) {
405            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
406            delete bsset;
407        }
408        delete scriptSets;
409    }
410    utrie2_close(anyCaseTrie);
411    utrie2_close(lowerCaseTrie);
412    return;
413}
414
415U_NAMESPACE_END
416
417
418
419BuilderScriptSet::BuilderScriptSet() {
420    codePoint = -1;
421    trie = NULL;
422    sset = NULL;
423    index = 0;
424    rindex = 0;
425    scriptSetOwned = TRUE;
426}
427
428BuilderScriptSet::~BuilderScriptSet() {
429    if (scriptSetOwned) {
430        delete sset;
431    }
432}
433
#endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif //  !UCONFIG_NO_NORMALIZATION
436
437