1b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru/*
2b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru******************************************************************************
3b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*
48393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius*   Copyright (C) 2008-2013, International Business Machines
5b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
6b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*
7b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru******************************************************************************
850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho*   file name:  uspoof_wsconf.cpp
9b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*   encoding:   US-ASCII
10b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*   tab size:   8 (not used)
11b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*   indentation:4
12b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*
13b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*   created on: 2009Jan05  (refactoring earlier files)
14b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*   created by: Andy Heninger
15b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*
16b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*   Internal functions for compiling Whole Script confusable source data
17b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*   into its binary (runtime) form.  The binary data format is described
18b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*   in uspoof_impl.h
19b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*/
20b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
21b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/utypes.h"
22b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/uspoof.h"
23b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
24b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if !UCONFIG_NO_NORMALIZATION
25b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
26b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
28b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/unorm.h"
29b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/uregex.h"
30b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/ustring.h"
31b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "cmemory.h"
328393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius#include "scriptset.h"
33b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uspoof_impl.h"
34b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uhash.h"
35b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uvector.h"
36b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uassert.h"
3750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uspoof_wsconf.h"
38b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
39b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_NAMESPACE_USE
40b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
41b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |    |
//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |----------Target script.   We need this.
//    |               |----------------Src script.  Should match the script of the source
//    |                                code points.  Beyond checking that, we don't keep it.
//    |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups.
//
// The ".." separating the two ends of a code point range is matched literally
// (the dots are escaped).  A line using any other two characters as the separator
// does not match the data alternative and is therefore caught by the error
// alternative (Group 8) instead of being silently accepted as a range.
static const char *parseExp =
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //    Any line not matching the preceding
                                                       //    parts of the expression will match
                                                       //    this, and thus be flagged as an error
69b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
70b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
71b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// Extract a regular expression match group into a char * string.
72b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//    The group must contain only invariant characters.
73b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//    Used for script names
74b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//
75b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querustatic void extractGroup(
76b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
77b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
78b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UChar ubuf[50];
79b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    ubuf[0] = 0;
80b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    destBuf[0] = 0;
81b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    int32_t len = uregex_group(e, group, ubuf, 50, &status);
82b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
83b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        return;
84b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
85b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
86b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    s.extract(0, len, destBuf, destCapacity, US_INV);
87b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru}
88b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
89b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
90b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
91103e9ffba2cba345d0078eb8b8db33249f81840aCraig CorneliusU_NAMESPACE_BEGIN
92103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius
93b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//  Build the Whole Script Confusable data
94b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//
95b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
96b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//                         because everything is local to this one build function anyhow,
97b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//                           OR
98b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//                         break this function into more reasonably sized pieces, with
99b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//                         state in WSConfusableDataBuilder.
100b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru//
101b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruvoid buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
102b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
103b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru{
104b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (U_FAILURE(status)) {
105b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        return;
106b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
107b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    URegularExpression *parseRegexp = NULL;
108b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    int32_t             inputLen    = 0;
109b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UChar              *input       = NULL;
110b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    int32_t             lineNum     = 0;
111b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
112b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UVector            *scriptSets        = NULL;
113b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    uint32_t            rtScriptSetsCount = 2;
114b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
115b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UTrie2             *anyCaseTrie   = NULL;
116b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    UTrie2             *lowerCaseTrie = NULL;
117b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
118b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    anyCaseTrie = utrie2_open(0, 0, &status);
119b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    lowerCaseTrie = utrie2_open(0, 0, &status);
120b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho
121b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    UnicodeString pattern(parseExp, -1, US_INV);
122b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
123b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
124b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //
125b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Reserved TRIE values:
126b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   0:  Code point has no whole script confusables.
127b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   1:  Code point is of script Common or Inherited.
128b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //       These code points do not participate in whole script confusable detection.
129b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //       (This is logically equivalent to saying that they contain confusables in
130b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //        all scripts)
131b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //
132b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Because Trie values are indexes into the ScriptSets vector, pre-fill
133b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // vector positions 0 and 1 to avoid conflicts with the reserved values.
134b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
135b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    scriptSets = new UVector(status);
136b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (scriptSets == NULL) {
137b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        status = U_MEMORY_ALLOCATION_ERROR;
138b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        goto cleanup;
139b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
140b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    scriptSets->addElement((void *)NULL, status);
141b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    scriptSets->addElement((void *)NULL, status);
142b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
143b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Convert the user input data from UTF-8 to UChar (UTF-16)
144b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
145b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (status != U_BUFFER_OVERFLOW_ERROR) {
146b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        goto cleanup;
147b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
148b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    status = U_ZERO_ERROR;
149b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
150b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (input == NULL) {
151b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        status = U_MEMORY_ALLOCATION_ERROR;
152b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        goto cleanup;
153b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
154b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
155b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
156b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
157b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
158b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
159b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   given the syntax of the input.
160b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (*input == 0xfeff) {
161b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        *input = 0x20;
162b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
163b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
164b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Parse the input, one line per iteration of this loop.
165b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    uregex_setText(parseRegexp, input, inputLen, &status);
166b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    while (uregex_findNext(parseRegexp, &status)) {
167b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        lineNum++;
168b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (uregex_start(parseRegexp, 1, &status) >= 0) {
169b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            // this was a blank or comment line.
170b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            continue;
171b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
172b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (uregex_start(parseRegexp, 8, &status) >= 0) {
173b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            // input file syntax error.
174b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            status = U_PARSE_ERROR;
175b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            goto cleanup;
176b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
177b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (U_FAILURE(status)) {
178b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            goto cleanup;
179b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
180b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
181b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        // Pick up the start and optional range end code points from the parsed line.
182b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        UChar32  startCodePoint = SpoofImpl::ScanHex(
183b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
184b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        UChar32  endCodePoint = startCodePoint;
185b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (uregex_start(parseRegexp, 3, &status) >=0) {
186b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            endCodePoint = SpoofImpl::ScanHex(
187b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
188b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
189b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
190b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        // Extract the two script names from the source line.  We need these in an 8 bit
191b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
192b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
193b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        char  srcScriptName[20];
194b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        char  targScriptName[20];
195b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
196b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
197b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        UScriptCode srcScript  =
198b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
199b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        UScriptCode targScript =
200b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
201b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (U_FAILURE(status)) {
202b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            goto cleanup;
203b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
204b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
205b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            status = U_INVALID_FORMAT_ERROR;
206b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            goto cleanup;
207b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
208b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
209b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        // select the table - (A) any case or (L) lower case only
210b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        UTrie2 *table = anyCaseTrie;
211b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (uregex_start(parseRegexp, 7, &status) >= 0) {
212b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            table = lowerCaseTrie;
213b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
214b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
215b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        // Build the set of scripts containing confusable characters for
216b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        //   the code point(s) specified in this input line.
217b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        // Sanity check that the script of the source code point is the same
218b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        //   as the source script indicated in the input file.  Failure of this check is
219b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        //   an error in the input file.
220b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        // Include the source script in the set (needed for Mixed Script Confusable detection).
221b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        //
222b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        UChar32 cp;
223b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
224b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            int32_t setIndex = utrie2_get32(table, cp);
225b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            BuilderScriptSet *bsset = NULL;
226b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            if (setIndex > 0) {
227b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                U_ASSERT(setIndex < scriptSets->size());
228b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
229b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            } else {
230b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                bsset = new BuilderScriptSet();
231b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                if (bsset == NULL) {
232b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                    status = U_MEMORY_ALLOCATION_ERROR;
233b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                    goto cleanup;
234b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                }
235b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                bsset->codePoint = cp;
236b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                bsset->trie = table;
237b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                bsset->sset = new ScriptSet();
238b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                setIndex = scriptSets->size();
239b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                bsset->index = setIndex;
240b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                bsset->rindex = 0;
241b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                if (bsset->sset == NULL) {
242b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                    status = U_MEMORY_ALLOCATION_ERROR;
243b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                    goto cleanup;
244b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                }
245b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                scriptSets->addElement(bsset, status);
246b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                utrie2_set32(table, cp, setIndex, &status);
247b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            }
2488393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius            bsset->sset->set(targScript, status);
2498393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius            bsset->sset->set(srcScript, status);
250b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
251b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            if (U_FAILURE(status)) {
252b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                goto cleanup;
253b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            }
254b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            UScriptCode cpScript = uscript_getScript(cp, &status);
255b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            if (cpScript != srcScript) {
256b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                status = U_INVALID_FORMAT_ERROR;
257b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                goto cleanup;
258b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            }
259b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
260b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
261b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
262b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Eliminate duplicate script sets.  At this point we have a separate
263b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // script set for every code point that had data in the input file.
264b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //
265b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
266b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //
267b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // printf("Number of scriptSets: %d\n", scriptSets->size());
268b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    {
269b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        int32_t duplicateCount = 0;
270b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        rtScriptSetsCount = 2;
271b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
272b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
273b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            if (outerSet->index != static_cast<uint32_t>(outeri)) {
274b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                // This set was already identified as a duplicate.
275b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                //   It will not be allocated a position in the runtime array of ScriptSets.
276b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                continue;
277b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            }
278b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            outerSet->rindex = rtScriptSetsCount++;
279b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
280b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
281b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
282b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                    delete innerSet->sset;
283b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                    innerSet->scriptSetOwned = FALSE;
284b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                    innerSet->sset = outerSet->sset;
285b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                    innerSet->index = outeri;
286b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                    innerSet->rindex = outerSet->rindex;
287b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                    duplicateCount++;
288b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                }
289b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                // But this doesn't get all.  We need to fix the TRIE.
290b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            }
291b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
292b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
293b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
294b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
295b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
296b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
297b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
298b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
299b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //     are unused, which is why the loop index starts at 2.)
300b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    {
301b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        for (int32_t i=2; i<scriptSets->size(); i++) {
302b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
303b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            if (bSet->rindex != (uint32_t)i) {
304b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
305b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            }
306b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
307b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
308b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
309b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // For code points with script==Common or script==Inherited,
310b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   Set the reserved value of 1 into both Tries.  These characters do not participate
311b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   in Whole Script Confusable detection; this reserved value is the means
312b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   by which they are detected.
313b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    {
314b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        UnicodeSet ignoreSet;
315b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
316b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        UnicodeSet inheritedSet;
317b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
318b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        ignoreSet.addAll(inheritedSet);
319b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
320b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
321b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
322b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
323b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
324b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
325b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
326b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
327b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Serialize the data to the Spoof Detector
328b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    {
329b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
330b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
331b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        // printf("Any case Trie size: %d\n", size);
332b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (status != U_BUFFER_OVERFLOW_ERROR) {
333b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            goto cleanup;
334b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
335b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        status = U_ZERO_ERROR;
336b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
337b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
338b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
339b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        void *where = spImpl->fSpoofData->reserveSpace(size, status);
340b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        utrie2_serialize(anyCaseTrie, where, size, &status);
341b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
342b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
343b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
344b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        // printf("Lower case Trie size: %d\n", size);
345b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        if (status != U_BUFFER_OVERFLOW_ERROR) {
346b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            goto cleanup;
347b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
348b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        status = U_ZERO_ERROR;
349b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
350b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
351b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
352b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        where = spImpl->fSpoofData->reserveSpace(size, status);
353b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        utrie2_serialize(lowerCaseTrie, where, size, &status);
354b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
355b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
356b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
357b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
358b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
359b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        uint32_t rindex = 2;
360b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        for (int32_t i=2; i<scriptSets->size(); i++) {
361b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
362b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            if (bSet->rindex < rindex) {
363b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                // We have already copied this script set to the serialized data.
364b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                continue;
365b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            }
366b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            U_ASSERT(rindex == bSet->rindex);
367b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
368b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            rindex++;
369b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        }
370b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
371b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
372b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Open new utrie2s from the serialized data.  We don't want to keep the ones
373b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   we just built because we would then have two copies of the data, one internal to
374b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   the utries that we have already constructed, and one in the serialized data area.
375b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   An alternative would be to not pre-serialize the Trie data, but that makes the
376b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   spoof detector data different, depending on how the detector was constructed.
377b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    //   It's simpler to keep the data always the same.
378b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
379b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
380b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            UTRIE2_16_VALUE_BITS,
381b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
382b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
383b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            NULL,
384b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            &status);
385b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
386b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
387b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            UTRIE2_16_VALUE_BITS,
388b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
389b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
390b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            NULL,
391b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            &status);
392b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
393b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
394b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
395b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querucleanup:
396b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (U_FAILURE(status)) {
397b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        pe->line = lineNum;
398b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
399b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    uregex_close(parseRegexp);
400b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    uprv_free(input);
401b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
402b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    int32_t i;
403103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius    if (scriptSets != NULL) {
404103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius        for (i=0; i<scriptSets->size(); i++) {
405103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
406103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius            delete bsset;
407103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius        }
408103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius        delete scriptSets;
409b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
410b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    utrie2_close(anyCaseTrie);
411b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    utrie2_close(lowerCaseTrie);
412b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    return;
413b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru}
414b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
415103e9ffba2cba345d0078eb8b8db33249f81840aCraig CorneliusU_NAMESPACE_END
416b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
417b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
418b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
419b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruBuilderScriptSet::BuilderScriptSet() {
420b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    codePoint = -1;
421b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    trie = NULL;
422b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    sset = NULL;
423b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    index = 0;
424b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    rindex = 0;
425b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    scriptSetOwned = TRUE;
426b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru}
427b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
428b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruBuilderScriptSet::~BuilderScriptSet() {
429b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (scriptSetOwned) {
430b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        delete sset;
431b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
432b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru}
433b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
434b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#endif
435b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
436b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
437