/*
******************************************************************************
*
*   Copyright (C) 2008-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  uspoof_wsconf.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009Jan05  (refactoring earlier files)
*   created by: Andy Heninger
*
*   Internal functions for compiling Whole Script confusable source data
*   into its binary (runtime) form.
The binary data format is described 18b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru* in uspoof_impl.h 19b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru*/ 20b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 21b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/utypes.h" 22b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/uspoof.h" 23b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 24b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if !UCONFIG_NO_NORMALIZATION 25b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 26b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 27b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 28b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/unorm.h" 29b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/uregex.h" 30b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/ustring.h" 31b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "cmemory.h" 328393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius#include "scriptset.h" 33b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uspoof_impl.h" 34b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uhash.h" 35b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uvector.h" 36b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uassert.h" 3750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uspoof_wsconf.h" 38b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 39b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_NAMESPACE_USE 40b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 41b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 42b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// Regular expression for parsing a line from 
//   the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F          ; Latn; Deva; A # (o)  LATIN SMALL LETTER O
//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |    |
//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |----------Target script.  We need this.
//    |               |----------------Src script.  Should match the script of the source
//    |                                code points.  Beyond checking that, we don't keep it.
//    |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups.
//
// Note: the ".." range separator is escaped so that only two literal dots are
//       accepted between the start and end code points.  Unescaped, each dot
//       would match any character and malformed ranges would be accepted
//       instead of being reported through the error group.
static const char *parseExp =
        "(?m)"                                              // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                              // A blank or comment line.  Matches Group 1.
        "|^(?:"                                             //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;"  // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                              // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                              // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                                   // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                                  // Trailing comment
        ")$|"                                               //   OR
        "^(.*?)$";                                          // An error line.      Group 8.
                                                            //  Any line not matching the preceding
                                                            //  parts of the expression will match
                                                            //  this, and thus be flagged as an error


// Extract a regular expression match group into a char * string.
//    The group must contain only invariant characters.
73b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// Used for script names 74b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// 75b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querustatic void extractGroup( 76b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { 77b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 78b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UChar ubuf[50]; 79b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ubuf[0] = 0; 80b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru destBuf[0] = 0; 81b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t len = uregex_group(e, group, ubuf, 50, &status); 82b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status) || len == -1 || len >= destCapacity) { 83b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return; 84b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 85b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UnicodeString s(FALSE, ubuf, len); // Aliasing constructor 86b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru s.extract(0, len, destBuf, destCapacity, US_INV); 87b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru} 88b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 89b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 90b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 91103e9ffba2cba345d0078eb8b8db33249f81840aCraig CorneliusU_NAMESPACE_BEGIN 92103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 93b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// Build the Whole Script Confusable data 94b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// 95b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// TODO: Reorganize. 
Either get rid of the WSConfusableDataBuilder class, 96b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// because everything is local to this one build function anyhow, 97b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// OR 98b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// break this function into more reasonably sized pieces, with 99b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// state in WSConfusableDataBuilder. 100b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// 101b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruvoid buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, 102b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 103b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru{ 104b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status)) { 105b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return; 106b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 107b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru URegularExpression *parseRegexp = NULL; 108b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t inputLen = 0; 109b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UChar *input = NULL; 110b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t lineNum = 0; 111b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 112b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UVector *scriptSets = NULL; 113b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t rtScriptSetsCount = 2; 114b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 115b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UTrie2 *anyCaseTrie = NULL; 116b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UTrie2 *lowerCaseTrie = NULL; 117b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 
118b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru anyCaseTrie = utrie2_open(0, 0, &status); 119b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru lowerCaseTrie = utrie2_open(0, 0, &status); 120b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 121b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString pattern(parseExp, -1, US_INV); 122b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 123b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // The scriptSets vector provides a mapping from TRIE values to the set of scripts. 124b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // 125b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Reserved TRIE values: 126b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // 0: Code point has no whole script confusables. 127b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // 1: Code point is of script Common or Inherited. 128b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // These code points do not participate in whole script confusable detection. 129b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // (This is logically equivalent to saying that they contain confusables in 130b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // all scripts) 131b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // 132b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Because Trie values are indexes into the ScriptSets vector, pre-fill 133b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // vector positions 0 and 1 to avoid conflicts with the reserved values. 
134b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 135b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru scriptSets = new UVector(status); 136b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (scriptSets == NULL) { 137b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 138b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 139b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 140b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru scriptSets->addElement((void *)NULL, status); 141b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru scriptSets->addElement((void *)NULL, status); 142b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 143b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Convert the user input data from UTF-8 to UChar (UTF-16) 144b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); 145b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (status != U_BUFFER_OVERFLOW_ERROR) { 146b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 147b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 148b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = U_ZERO_ERROR; 149b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); 150b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (input == NULL) { 151b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 152b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 153b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 154b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); 
155b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 156b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); 157b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 158b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Zap any Byte Order Mark at the start of input. Changing it to a space is benign 159b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // given the syntax of the input. 160b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (*input == 0xfeff) { 161b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *input = 0x20; 162b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 163b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 164b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Parse the input, one line per iteration of this loop. 165b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uregex_setText(parseRegexp, input, inputLen, &status); 166b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru while (uregex_findNext(parseRegexp, &status)) { 167b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru lineNum++; 168b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (uregex_start(parseRegexp, 1, &status) >= 0) { 169b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // this was a blank or comment line. 170b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru continue; 171b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 172b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (uregex_start(parseRegexp, 8, &status) >= 0) { 173b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // input file syntax error. 
174b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = U_PARSE_ERROR; 175b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 176b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 177b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status)) { 178b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 179b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 180b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 181b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Pick up the start and optional range end code points from the parsed line. 182b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UChar32 startCodePoint = SpoofImpl::ScanHex( 183b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); 184b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UChar32 endCodePoint = startCodePoint; 185b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (uregex_start(parseRegexp, 3, &status) >=0) { 186b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru endCodePoint = SpoofImpl::ScanHex( 187b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); 188b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 189b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 190b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Extract the two script names from the source line. We need these in an 8 bit 191b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on 192b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // to the ICU u_getPropertyValueEnum() function. Ugh. 
193b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru char srcScriptName[20]; 194b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru char targScriptName[20]; 195b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); 196b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); 197b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UScriptCode srcScript = 198b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); 199b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UScriptCode targScript = 200b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); 201b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status)) { 202b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 203b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 204b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { 205b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = U_INVALID_FORMAT_ERROR; 206b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 207b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 208b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 209b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // select the table - (A) any case or (L) lower case only 210b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UTrie2 *table = anyCaseTrie; 211b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (uregex_start(parseRegexp, 7, &status) >= 0) { 212b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru table = lowerCaseTrie; 
213b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 214b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 215b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Build the set of scripts containing confusable characters for 216b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // the code point(s) specified in this input line. 217b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Sanity check that the script of the source code point is the same 218b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // as the source script indicated in the input file. Failure of this check is 219b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // an error in the input file. 220b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Include the source script in the set (needed for Mixed Script Confusable detection). 221b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // 222b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UChar32 cp; 223b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru for (cp=startCodePoint; cp<=endCodePoint; cp++) { 224b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t setIndex = utrie2_get32(table, cp); 225b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru BuilderScriptSet *bsset = NULL; 226b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (setIndex > 0) { 227b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru U_ASSERT(setIndex < scriptSets->size()); 228b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); 229b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } else { 230b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru bsset = new BuilderScriptSet(); 231b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (bsset == NULL) { 232b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = 
U_MEMORY_ALLOCATION_ERROR; 233b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 234b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 235b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru bsset->codePoint = cp; 236b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru bsset->trie = table; 237b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru bsset->sset = new ScriptSet(); 238b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru setIndex = scriptSets->size(); 239b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru bsset->index = setIndex; 240b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru bsset->rindex = 0; 241b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (bsset->sset == NULL) { 242b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 243b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 244b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 245b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru scriptSets->addElement(bsset, status); 246b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru utrie2_set32(table, cp, setIndex, &status); 247b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 2488393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius bsset->sset->set(targScript, status); 2498393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius bsset->sset->set(srcScript, status); 250b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 251b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status)) { 252b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 253b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 254b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UScriptCode cpScript = uscript_getScript(cp, &status); 255b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (cpScript != srcScript) { 
256b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = U_INVALID_FORMAT_ERROR; 257b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 258b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 259b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 260b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 261b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 262b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Eliminate duplicate script sets. At this point we have a separate 263b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // script set for every code point that had data in the input file. 264b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // 265b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them 266b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // 267b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // printf("Number of scriptSets: %d\n", scriptSets->size()); 268b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru { 269b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t duplicateCount = 0; 270b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru rtScriptSetsCount = 2; 271b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { 272b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); 273b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (outerSet->index != static_cast<uint32_t>(outeri)) { 274b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // This set was already identified as a duplicate. 275b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // It will not be allocated a position in the runtime array of ScriptSets. 
276b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru continue; 277b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 278b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru outerSet->rindex = rtScriptSetsCount++; 279b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { 280b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); 281b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { 282b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru delete innerSet->sset; 283b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru innerSet->scriptSetOwned = FALSE; 284b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru innerSet->sset = outerSet->sset; 285b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru innerSet->index = outeri; 286b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru innerSet->rindex = outerSet->rindex; 287b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru duplicateCount++; 288b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 289b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // But this doesn't get all. We need to fix the TRIE. 
290b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 291b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 292b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); 293b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 294b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 295b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 296b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 297b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Update the Trie values to be reflect the run time script indexes (after duplicate merging). 298b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets 299b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // are unused, which is why the loop index starts at 2.) 300b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru { 301b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru for (int32_t i=2; i<scriptSets->size(); i++) { 302b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 303b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (bSet->rindex != (uint32_t)i) { 304b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); 305b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 306b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 307b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 308b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 309b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // For code points with script==Common or script==Inherited, 310b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Set the reserved value of 1 into both Tries. 
These characters do not participate 311b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // in Whole Script Confusable detection; this reserved value is the means 312b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // by which they are detected. 313b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru { 314b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UnicodeSet ignoreSet; 315b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 316b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UnicodeSet inheritedSet; 317b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 318b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ignoreSet.addAll(inheritedSet); 319b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { 320b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UChar32 rangeStart = ignoreSet.getRangeStart(rn); 321b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); 322b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); 323b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); 324b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 325b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 326b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 327b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Serialize the data to the Spoof Detector 328b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru { 329b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); 
330b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); 331b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // printf("Any case Trie size: %d\n", size); 332b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (status != U_BUFFER_OVERFLOW_ERROR) { 333b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 334b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 335b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = U_ZERO_ERROR; 336b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; 337b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; 338b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; 339b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru void *where = spImpl->fSpoofData->reserveSpace(size, status); 340b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru utrie2_serialize(anyCaseTrie, where, size, &status); 341b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 342b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); 343b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); 344b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // printf("Lower case Trie size: %d\n", size); 345b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (status != U_BUFFER_OVERFLOW_ERROR) { 346b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru goto cleanup; 347b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 348b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru status = U_ZERO_ERROR; 349b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 
spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; 350b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; 351b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; 352b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru where = spImpl->fSpoofData->reserveSpace(size, status); 353b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru utrie2_serialize(lowerCaseTrie, where, size, &status); 354b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 355b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; 356b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; 357b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ScriptSet *rtScriptSets = static_cast<ScriptSet *> 358b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); 359b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uint32_t rindex = 2; 360b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru for (int32_t i=2; i<scriptSets->size(); i++) { 361b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 362b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (bSet->rindex < rindex) { 363b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // We have already copied this script set to the serialized data. 
364b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru continue; 365b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 366b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru U_ASSERT(rindex == bSet->rindex); 367b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. 368b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru rindex++; 369b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 370b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 371b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 372b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Open new utrie2s from the serialized data. We don't want to keep the ones 373b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // we just built because we would then have two copies of the data, one internal to 374b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // the utries that we have already constructed, and one in the serialized data area. 375b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // An alternative would be to not pre-serialize the Trie data, but that makes the 376b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // spoof detector data different, depending on how the detector was constructed. 377b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // It's simpler to keep the data always the same. 
378b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 379b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( 380b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UTRIE2_16_VALUE_BITS, 381b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, 382b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, 383b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru NULL, 384b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru &status); 385b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 386b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( 387b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru UTRIE2_16_VALUE_BITS, 388b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, 389b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, 390b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru NULL, 391b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru &status); 392b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 393b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 394b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 395b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Querucleanup: 396b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(status)) { 397b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru pe->line = lineNum; 398b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 399b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uregex_close(parseRegexp); 
400b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru uprv_free(input); 401b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 402b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t i; 403103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (scriptSets != NULL) { 404103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius for (i=0; i<scriptSets->size(); i++) { 405103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 406103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius delete bsset; 407103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 408103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius delete scriptSets; 409b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 410b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru utrie2_close(anyCaseTrie); 411b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru utrie2_close(lowerCaseTrie); 412b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return; 413b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru} 414b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 415103e9ffba2cba345d0078eb8b8db33249f81840aCraig CorneliusU_NAMESPACE_END 416b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 417b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 418b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 419b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruBuilderScriptSet::BuilderScriptSet() { 420b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru codePoint = -1; 421b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru trie = NULL; 422b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru sset = NULL; 423b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru index = 0; 424b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru rindex = 0; 
425b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru scriptSetOwned = TRUE; 426b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru} 427b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 428b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruBuilderScriptSet::~BuilderScriptSet() { 429b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (scriptSetOwned) { 430b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru delete sset; 431b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 432b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru} 433b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 434b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#endif 435b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 436b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 437