/*
******************************************************************************
*
*   Copyright (C) 2008-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  uspoof_wsconf.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009Jan05  (refactoring earlier files)
*   created by: Andy Heninger
*
*   Internal functions for compiling Whole Script confusable source data
*   into its binary (runtime) form.
The binary data format is described 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* in uspoof_impl.h 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/ 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uspoof.h" 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_NORMALIZATION 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_REGULAR_EXPRESSIONS 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/unorm.h" 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uregex.h" 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ustring.h" 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "cmemory.h" 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "scriptset.h" 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uspoof_impl.h" 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uhash.h" 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uvector.h" 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uassert.h" 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uspoof_wsconf.h" 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_USE 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Regular expression for parsing a line from the Unicode file 
//     confusablesWholeScript.txt
// Example Lines:
//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |    |
//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |----------Target script.   We need this.
//    |               |----------------Src script.  Should match the script of the source
//    |                                code points.  Beyond checking that, we don't keep it.
//    |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups:
//    Group 1      - set for a blank or comment-only line.
//    Groups 2, 3  - first code point, and the optional end of a code point range.
//    Groups 4, 5  - source and target script names.
//    Group 6 or 7 - the table selector, (A) or (L) respectively.
//    Group 8      - set for any line that none of the alternatives above accept.
//
// The range separator is matched as two literal dots.  An unescaped ".." would
// accept any two characters between the code points, letting a malformed range
// such as "0048xy0049" parse as valid data instead of falling through to the
// error alternative (group 8).
static const char *parseExp =
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //  Any line not matching the preceding
                                                       //  parts of the expression will match
                                                       //  this, and thus be flagged as an error


// Extract a regular expression match group into a char * string.
//    The group must contain only invariant characters.
736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Used for script names 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic void extractGroup( 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar ubuf[50]; 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ubuf[0] = 0; 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org destBuf[0] = 0; 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t len = uregex_group(e, group, ubuf, 50, &status); 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status) || len == -1 || len >= destCapacity) { 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString s(FALSE, ubuf, len); // Aliasing constructor 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org s.extract(0, len, destBuf, destCapacity, US_INV); 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Build the Whole Script Confusable data 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// TODO: Reorganize. 
Either get rid of the WSConfusableDataBuilder class, 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// because everything is local to this one build function anyhow, 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// OR 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// break this function into more reasonably sized pieces, with 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// state in WSConfusableDataBuilder. 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org URegularExpression *parseRegexp = NULL; 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t inputLen = 0; 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar *input = NULL; 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t lineNum = 0; 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UVector *scriptSets = NULL; 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t rtScriptSetsCount = 2; 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UTrie2 *anyCaseTrie = NULL; 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UTrie2 *lowerCaseTrie = NULL; 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org anyCaseTrie = utrie2_open(0, 0, &status); 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lowerCaseTrie = utrie2_open(0, 0, &status); 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString pattern(parseExp, -1, US_INV); 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The scriptSets vector provides a mapping from TRIE values to the set of scripts. 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Reserved TRIE values: 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 0: Code point has no whole script confusables. 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 1: Code point is of script Common or Inherited. 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // These code points do not participate in whole script confusable detection. 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // (This is logically equivalent to saying that they contain confusables in 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // all scripts) 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Because Trie values are indexes into the ScriptSets vector, pre-fill 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // vector positions 0 and 1 to avoid conflicts with the reserved values. 
1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org scriptSets = new UVector(status); 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (scriptSets == NULL) { 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org scriptSets->addElement((void *)NULL, status); 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org scriptSets->addElement((void *)NULL, status); 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Convert the user input data from UTF-8 to UChar (UTF-16) 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (status != U_BUFFER_OVERFLOW_ERROR) { 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_ZERO_ERROR; 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (input == NULL) { 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Zap any Byte Order Mark at the start of input. Changing it to a space is benign 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // given the syntax of the input. 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (*input == 0xfeff) { 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *input = 0x20; 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Parse the input, one line per iteration of this loop. 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uregex_setText(parseRegexp, input, inputLen, &status); 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (uregex_findNext(parseRegexp, &status)) { 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lineNum++; 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (uregex_start(parseRegexp, 1, &status) >= 0) { 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // this was a blank or comment line. 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (uregex_start(parseRegexp, 8, &status) >= 0) { 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // input file syntax error. 
1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_PARSE_ERROR; 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Pick up the start and optional range end code points from the parsed line. 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 startCodePoint = SpoofImpl::ScanHex( 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 endCodePoint = startCodePoint; 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (uregex_start(parseRegexp, 3, &status) >=0) { 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org endCodePoint = SpoofImpl::ScanHex( 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Extract the two script names from the source line. We need these in an 8 bit 1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on 1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // to the ICU u_getPropertyValueEnum() function. Ugh. 
1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char srcScriptName[20]; 1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org char targScriptName[20]; 1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); 1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); 1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UScriptCode srcScript = 1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); 1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UScriptCode targScript = 2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); 2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { 2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_INVALID_FORMAT_ERROR; 2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // select the table - (A) any case or (L) lower case only 2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UTrie2 *table = anyCaseTrie; 2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (uregex_start(parseRegexp, 7, &status) >= 0) { 2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org table = lowerCaseTrie; 
2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Build the set of scripts containing confusable characters for 2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the code point(s) specified in this input line. 2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Sanity check that the script of the source code point is the same 2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // as the source script indicated in the input file. Failure of this check is 2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // an error in the input file. 2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Include the source script in the set (needed for Mixed Script Confusable detection). 2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 cp; 2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (cp=startCodePoint; cp<=endCodePoint; cp++) { 2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t setIndex = utrie2_get32(table, cp); 2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BuilderScriptSet *bsset = NULL; 2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (setIndex > 0) { 2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U_ASSERT(setIndex < scriptSets->size()); 2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); 2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bsset = new BuilderScriptSet(); 2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (bsset == NULL) { 2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 
2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bsset->codePoint = cp; 2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bsset->trie = table; 2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bsset->sset = new ScriptSet(); 2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org setIndex = scriptSets->size(); 2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bsset->index = setIndex; 2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bsset->rindex = 0; 2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (bsset->sset == NULL) { 2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_MEMORY_ALLOCATION_ERROR; 2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org scriptSets->addElement(bsset, status); 2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utrie2_set32(table, cp, setIndex, &status); 2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bsset->sset->set(targScript, status); 2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bsset->sset->set(srcScript, status); 2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UScriptCode cpScript = uscript_getScript(cp, &status); 2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (cpScript != srcScript) { 2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
status = U_INVALID_FORMAT_ERROR; 2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Eliminate duplicate script sets. At this point we have a separate 2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // script set for every code point that had data in the input file. 2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them 2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // printf("Number of scriptSets: %d\n", scriptSets->size()); 2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org { 2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t duplicateCount = 0; 2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rtScriptSetsCount = 2; 2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { 2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); 2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (outerSet->index != static_cast<uint32_t>(outeri)) { 2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // This set was already identified as a duplicate. 2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // It will not be allocated a position in the runtime array of ScriptSets. 
2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org outerSet->rindex = rtScriptSetsCount++; 2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { 2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); 2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { 2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete innerSet->sset; 2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org innerSet->scriptSetOwned = FALSE; 2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org innerSet->sset = outerSet->sset; 2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org innerSet->index = outeri; 2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org innerSet->rindex = outerSet->rindex; 2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org duplicateCount++; 2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // But this doesn't get all. We need to fix the TRIE. 
2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); 2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Update the Trie values to be reflect the run time script indexes (after duplicate merging). 2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets 2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // are unused, which is why the loop index starts at 2.) 3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org { 3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t i=2; i<scriptSets->size(); i++) { 3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (bSet->rindex != (uint32_t)i) { 3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); 3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // For code points with script==Common or script==Inherited, 3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Set the reserved value of 1 into both Tries. 
These characters do not participate 3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // in Whole Script Confusable detection; this reserved value is the means 3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // by which they are detected. 3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org { 3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeSet ignoreSet; 3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeSet inheritedSet; 3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ignoreSet.addAll(inheritedSet); 3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { 3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 rangeStart = ignoreSet.getRangeStart(rn); 3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); 3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); 3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); 3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Serialize the data to the Spoof Detector 3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org { 3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); 3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); 3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // printf("Any case Trie size: %d\n", size); 3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (status != U_BUFFER_OVERFLOW_ERROR) { 3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_ZERO_ERROR; 3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; 3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; 3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; 3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org void *where = spImpl->fSpoofData->reserveSpace(size, status); 3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utrie2_serialize(anyCaseTrie, where, size, &status); 3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); 3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); 3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // printf("Lower case Trie size: %d\n", size); 3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (status != U_BUFFER_OVERFLOW_ERROR) { 3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto cleanup; 3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org status = U_ZERO_ERROR; 3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; 
3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; 3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; 3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org where = spImpl->fSpoofData->reserveSpace(size, status); 3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utrie2_serialize(lowerCaseTrie, where, size, &status); 3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; 3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; 3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ScriptSet *rtScriptSets = static_cast<ScriptSet *> 3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); 3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t rindex = 2; 3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t i=2; i<scriptSets->size(); i++) { 3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (bSet->rindex < rindex) { 3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We have already copied this script set to the serialized data. 3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U_ASSERT(rindex == bSet->rindex); 3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. 
3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rindex++; 3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Open new utrie2s from the serialized data. We don't want to keep the ones 3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // we just built because we would then have two copies of the data, one internal to 3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the utries that we have already constructed, and one in the serialized data area. 3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // An alternative would be to not pre-serialize the Trie data, but that makes the 3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // spoof detector data different, depending on how the detector was constructed. 3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // It's simpler to keep the data always the same. 
3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( 3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UTRIE2_16_VALUE_BITS, 3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, 3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, 3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org NULL, 3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org &status); 3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( 3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UTRIE2_16_VALUE_BITS, 3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, 3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, 3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org NULL, 3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org &status); 3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgcleanup: 3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pe->line = lineNum; 3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uregex_close(parseRegexp); 4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_free(input); 
4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t i; 4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (scriptSets != NULL) { 4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (i=0; i<scriptSets->size(); i++) { 4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete bsset; 4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete scriptSets; 4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utrie2_close(anyCaseTrie); 4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utrie2_close(lowerCaseTrie); 4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END 4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBuilderScriptSet::BuilderScriptSet() { 4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org codePoint = -1; 4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org trie = NULL; 4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org sset = NULL; 4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org index = 0; 4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rindex = 0; 4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org scriptSetOwned = TRUE; 4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 
4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgBuilderScriptSet::~BuilderScriptSet() { 4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (scriptSetOwned) { 4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete sset; 4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 437