16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/*
26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org******************************************************************************
36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*
46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   Copyright (C) 2008-2013, International Business Machines
56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   Corporation and others.  All Rights Reserved.
66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*
76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org******************************************************************************
86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   file name:  uspoof_wsconf.cpp
96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   encoding:   US-ASCII
106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   tab size:   8 (not used)
116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   indentation:4
126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*
136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   created on: 2009Jan05  (refactoring earlier files)
146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   created by: Andy Heninger
156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*
*   Internal functions for compiling Whole Script confusable source data
176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   into its binary (runtime) form.  The binary data format is described
186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   in uspoof_impl.h
196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/
206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h"
226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uspoof.h"
236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_NORMALIZATION
256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_REGULAR_EXPRESSIONS
276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/unorm.h"
296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uregex.h"
306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ustring.h"
316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "cmemory.h"
326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "scriptset.h"
336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uspoof_impl.h"
346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uhash.h"
356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uvector.h"
366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uassert.h"
376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uspoof_wsconf.h"
386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_USE
406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Example Lines:
446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//    |               |     |    |
476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//    |               |     |----------Target script.   We need this.
496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//    |               |----------------Src script.  Should match the script of the source
506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//    |                                code points.  Beyond checking that, we don't keep it.
516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//    |--------------------------------Source code points or range.
526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//
536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// The expression will match _all_ lines, including erroneous lines.
546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// The result of the parse is returned via the contents of the (match) groups.
// Pattern used to parse the input one line at a time (multi-line mode).
// Exactly one of three alternatives matches each line:
//   - Group 1 is set for a blank or comment-only line.
//   - Groups 2..7 are set for a well-formed data line.
//   - Group 8 is set for anything else, i.e. a syntax error.
// The ".." range separator is escaped so that only two literal dots are
// accepted between the start and end code points; unescaped, '.' is a regex
// metacharacter and would accept any two characters there.
static const char *parseExp =
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //    Any line not matching the preceding
                                                       //    parts of the expression will match
                                                       //    this, and thus be flagged as an error
696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Extract a regular expression match group into a char * string.
726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//    The group must contain only invariant characters.
736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//    Used for script names
746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//
756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic void extractGroup(
766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar ubuf[50];
796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ubuf[0] = 0;
806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    destBuf[0] = 0;
816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t len = uregex_group(e, group, ubuf, 50, &status);
826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return;
846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    s.extract(0, len, destBuf, destCapacity, US_INV);
876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN
926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//  Build the Whole Script Confusable data
946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//
956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//                         because everything is local to this one build function anyhow,
976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//                           OR
986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//                         break this function into more reasonably sized pieces, with
996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//                         state in WSConfusableDataBuilder.
1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org//
1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{
1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (U_FAILURE(status)) {
1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return;
1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    URegularExpression *parseRegexp = NULL;
1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t             inputLen    = 0;
1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar              *input       = NULL;
1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t             lineNum     = 0;
1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UVector            *scriptSets        = NULL;
1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uint32_t            rtScriptSetsCount = 2;
1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UTrie2             *anyCaseTrie   = NULL;
1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UTrie2             *lowerCaseTrie = NULL;
1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    anyCaseTrie = utrie2_open(0, 0, &status);
1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    lowerCaseTrie = utrie2_open(0, 0, &status);
1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString pattern(parseExp, -1, US_INV);
1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //
1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Reserved TRIE values:
1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   0:  Code point has no whole script confusables.
1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   1:  Code point is of script Common or Inherited.
1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //       These code points do not participate in whole script confusable detection.
1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //       (This is logically equivalent to saying that they contain confusables in
1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //        all scripts)
1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //
1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Because Trie values are indexes into the ScriptSets vector, pre-fill
1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // vector positions 0 and 1 to avoid conflicts with the reserved values.
1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    scriptSets = new UVector(status);
1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (scriptSets == NULL) {
1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        status = U_MEMORY_ALLOCATION_ERROR;
1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto cleanup;
1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    scriptSets->addElement((void *)NULL, status);
1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    scriptSets->addElement((void *)NULL, status);
1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Convert the user input data from UTF-8 to UChar (UTF-16)
1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (status != U_BUFFER_OVERFLOW_ERROR) {
1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto cleanup;
1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    status = U_ZERO_ERROR;
1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (input == NULL) {
1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        status = U_MEMORY_ALLOCATION_ERROR;
1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        goto cleanup;
1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   given the syntax of the input.
1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (*input == 0xfeff) {
1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        *input = 0x20;
1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Parse the input, one line per iteration of this loop.
1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uregex_setText(parseRegexp, input, inputLen, &status);
1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    while (uregex_findNext(parseRegexp, &status)) {
1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        lineNum++;
1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (uregex_start(parseRegexp, 1, &status) >= 0) {
1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // this was a blank or comment line.
1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            continue;
1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (uregex_start(parseRegexp, 8, &status) >= 0) {
1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // input file syntax error.
1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            status = U_PARSE_ERROR;
1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            goto cleanup;
1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (U_FAILURE(status)) {
1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            goto cleanup;
1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Pick up the start and optional range end code points from the parsed line.
1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UChar32  startCodePoint = SpoofImpl::ScanHex(
1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UChar32  endCodePoint = startCodePoint;
1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (uregex_start(parseRegexp, 3, &status) >=0) {
1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            endCodePoint = SpoofImpl::ScanHex(
1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Extract the two script names from the source line.  We need these in an 8 bit
1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        char  srcScriptName[20];
1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        char  targScriptName[20];
1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UScriptCode srcScript  =
1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UScriptCode targScript =
2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (U_FAILURE(status)) {
2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            goto cleanup;
2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            status = U_INVALID_FORMAT_ERROR;
2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            goto cleanup;
2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // select the table - (A) any case or (L) lower case only
2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UTrie2 *table = anyCaseTrie;
2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (uregex_start(parseRegexp, 7, &status) >= 0) {
2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            table = lowerCaseTrie;
2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Build the set of scripts containing confusable characters for
2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        //   the code point(s) specified in this input line.
2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Sanity check that the script of the source code point is the same
2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        //   as the source script indicated in the input file.  Failure of this check is
2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        //   an error in the input file.
2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Include the source script in the set (needed for Mixed Script Confusable detection).
2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        //
2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UChar32 cp;
2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            int32_t setIndex = utrie2_get32(table, cp);
2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            BuilderScriptSet *bsset = NULL;
2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (setIndex > 0) {
2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                U_ASSERT(setIndex < scriptSets->size());
2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            } else {
2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                bsset = new BuilderScriptSet();
2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if (bsset == NULL) {
2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    status = U_MEMORY_ALLOCATION_ERROR;
2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    goto cleanup;
2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                bsset->codePoint = cp;
2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                bsset->trie = table;
2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                bsset->sset = new ScriptSet();
2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                setIndex = scriptSets->size();
2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                bsset->index = setIndex;
2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                bsset->rindex = 0;
2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if (bsset->sset == NULL) {
2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    status = U_MEMORY_ALLOCATION_ERROR;
2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    goto cleanup;
2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                scriptSets->addElement(bsset, status);
2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                utrie2_set32(table, cp, setIndex, &status);
2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            bsset->sset->set(targScript, status);
2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            bsset->sset->set(srcScript, status);
2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (U_FAILURE(status)) {
2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                goto cleanup;
2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            UScriptCode cpScript = uscript_getScript(cp, &status);
2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (cpScript != srcScript) {
2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                status = U_INVALID_FORMAT_ERROR;
2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                goto cleanup;
2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Eliminate duplicate script sets.  At this point we have a separate
2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // script set for every code point that had data in the input file.
2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //
2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //
2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // printf("Number of scriptSets: %d\n", scriptSets->size());
2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    {
2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        int32_t duplicateCount = 0;
2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        rtScriptSetsCount = 2;
2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (outerSet->index != static_cast<uint32_t>(outeri)) {
2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // This set was already identified as a duplicate.
2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                //   It will not be allocated a position in the runtime array of ScriptSets.
2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                continue;
2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            outerSet->rindex = rtScriptSetsCount++;
2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    delete innerSet->sset;
2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    innerSet->scriptSetOwned = FALSE;
2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    innerSet->sset = outerSet->sset;
2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    innerSet->index = outeri;
2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    innerSet->rindex = outerSet->rindex;
2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                    duplicateCount++;
2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                }
2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // But this doesn't get all.  We need to fix the TRIE.
2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
    // Update the Trie values to reflect the run time script indexes (after duplicate merging).
2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //     are unused, which is why the loop index starts at 2.)
3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    {
3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for (int32_t i=2; i<scriptSets->size(); i++) {
3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (bSet->rindex != (uint32_t)i) {
3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // For code points with script==Common or script==Inherited,
3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   Set the reserved value of 1 into both Tries.  These characters do not participate
3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   in Whole Script Confusable detection; this reserved value is the means
3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   by which they are detected.
3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    {
3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UnicodeSet ignoreSet;
3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UnicodeSet inheritedSet;
3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        ignoreSet.addAll(inheritedSet);
3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Serialize the data to the Spoof Detector
3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    {
3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // printf("Any case Trie size: %d\n", size);
3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (status != U_BUFFER_OVERFLOW_ERROR) {
3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            goto cleanup;
3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        status = U_ZERO_ERROR;
3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        void *where = spImpl->fSpoofData->reserveSpace(size, status);
3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        utrie2_serialize(anyCaseTrie, where, size, &status);
3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // printf("Lower case Trie size: %d\n", size);
3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if (status != U_BUFFER_OVERFLOW_ERROR) {
3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            goto cleanup;
3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        status = U_ZERO_ERROR;
3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        where = spImpl->fSpoofData->reserveSpace(size, status);
3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        utrie2_serialize(lowerCaseTrie, where, size, &status);
3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        uint32_t rindex = 2;
3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for (int32_t i=2; i<scriptSets->size(); i++) {
3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            if (bSet->rindex < rindex) {
3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                // We have already copied this script set to the serialized data.
3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                continue;
3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            }
3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            U_ASSERT(rindex == bSet->rindex);
3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            rindex++;
3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Open new utrie2s from the serialized data.  We don't want to keep the ones
3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   we just built because we would then have two copies of the data, one internal to
3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   the utries that we have already constructed, and one in the serialized data area.
3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   An alternative would be to not pre-serialize the Trie data, but that makes the
3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   spoof detector data different, depending on how the detector was constructed.
3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    //   It's simpler to keep the data always the same.
3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            UTRIE2_16_VALUE_BITS,
3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            NULL,
3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            &status);
3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            UTRIE2_16_VALUE_BITS,
3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            NULL,
3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            &status);
3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgcleanup:
3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (U_FAILURE(status)) {
3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        pe->line = lineNum;
3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uregex_close(parseRegexp);
4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    uprv_free(input);
4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t i;
4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (scriptSets != NULL) {
4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        for (i=0; i<scriptSets->size(); i++) {
4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            delete bsset;
4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        delete scriptSets;
4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    utrie2_close(anyCaseTrie);
4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    utrie2_close(lowerCaseTrie);
4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return;
4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END
4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBuilderScriptSet::BuilderScriptSet() {
4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    codePoint = -1;
4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    trie = NULL;
4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    sset = NULL;
4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    index = 0;
4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    rindex = 0;
4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    scriptSetOwned = TRUE;
4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBuilderScriptSet::~BuilderScriptSet() {
4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if (scriptSetOwned) {
4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        delete sset;
4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif
4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
437