1/* 2****************************************************************************** 3* 4* Copyright (C) 2008-2013, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7****************************************************************************** 8* file name: uspoof_wsconf.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2009Jan05 (refactoring earlier files) 14* created by: Andy Heninger 15* 16* Internal functions for compililing Whole Script confusable source data 17* into its binary (runtime) form. The binary data format is described 18* in uspoof_impl.h 19*/ 20 21#include "unicode/utypes.h" 22#include "unicode/uspoof.h" 23 24#if !UCONFIG_NO_NORMALIZATION 25 26#if !UCONFIG_NO_REGULAR_EXPRESSIONS 27 28#include "unicode/unorm.h" 29#include "unicode/uregex.h" 30#include "unicode/ustring.h" 31#include "cmemory.h" 32#include "scriptset.h" 33#include "uspoof_impl.h" 34#include "uhash.h" 35#include "uvector.h" 36#include "uassert.h" 37#include "uspoof_wsconf.h" 38 39U_NAMESPACE_USE 40 41 42// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt 43// Example Lines: 44// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O 45// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I 46// | | | | 47// | | | |---- Which table, Any Case or Lower Case (A or L) 48// | | |----------Target script. We need this. 49// | |----------------Src script. Should match the script of the source 50// | code points. Beyond checking that, we don't keep it. 51// |--------------------------------Source code points or range. 52// 53// The expression will match _all_ lines, including erroneous lines. 54// The result of the parse is returned via the contents of the (match) groups. 55static const char *parseExp = 56 "(?m)" // Multi-line mode 57 "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1. 58 "|^(?:" // OR 59 "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3. 60 "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4. 61 "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5. 62 "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7 63 "[ \\t]*(?:#.*?)?" // Trailing commment 64 ")$|" // OR 65 "^(.*?)$"; // An error line. Group 8. 66 // Any line not matching the preceding 67 // parts of the expression.will match 68 // this, and thus be flagged as an error 69 70 71// Extract a regular expression match group into a char * string. 72// The group must contain only invariant characters. 73// Used for script names 74// 75static void extractGroup( 76 URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { 77 78 UChar ubuf[50]; 79 ubuf[0] = 0; 80 destBuf[0] = 0; 81 int32_t len = uregex_group(e, group, ubuf, 50, &status); 82 if (U_FAILURE(status) || len == -1 || len >= destCapacity) { 83 return; 84 } 85 UnicodeString s(FALSE, ubuf, len); // Aliasing constructor 86 s.extract(0, len, destBuf, destCapacity, US_INV); 87} 88 89 90 91U_NAMESPACE_BEGIN 92 93// Build the Whole Script Confusable data 94// 95// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, 96// because everything is local to this one build function anyhow, 97// OR 98// break this function into more reasonably sized pieces, with 99// state in WSConfusableDataBuilder. 100// 101void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, 102 int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 103{ 104 if (U_FAILURE(status)) { 105 return; 106 } 107 URegularExpression *parseRegexp = NULL; 108 int32_t inputLen = 0; 109 UChar *input = NULL; 110 int32_t lineNum = 0; 111 112 UVector *scriptSets = NULL; 113 uint32_t rtScriptSetsCount = 2; 114 115 UTrie2 *anyCaseTrie = NULL; 116 UTrie2 *lowerCaseTrie = NULL; 117 118 anyCaseTrie = utrie2_open(0, 0, &status); 119 lowerCaseTrie = utrie2_open(0, 0, &status); 120 121 UnicodeString pattern(parseExp, -1, US_INV); 122 123 // The scriptSets vector provides a mapping from TRIE values to the set of scripts. 124 // 125 // Reserved TRIE values: 126 // 0: Code point has no whole script confusables. 127 // 1: Code point is of script Common or Inherited. 128 // These code points do not participate in whole script confusable detection. 129 // (This is logically equivalent to saying that they contain confusables in 130 // all scripts) 131 // 132 // Because Trie values are indexes into the ScriptSets vector, pre-fill 133 // vector positions 0 and 1 to avoid conflicts with the reserved values. 134 135 scriptSets = new UVector(status); 136 if (scriptSets == NULL) { 137 status = U_MEMORY_ALLOCATION_ERROR; 138 goto cleanup; 139 } 140 scriptSets->addElement((void *)NULL, status); 141 scriptSets->addElement((void *)NULL, status); 142 143 // Convert the user input data from UTF-8 to UChar (UTF-16) 144 u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); 145 if (status != U_BUFFER_OVERFLOW_ERROR) { 146 goto cleanup; 147 } 148 status = U_ZERO_ERROR; 149 input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); 150 if (input == NULL) { 151 status = U_MEMORY_ALLOCATION_ERROR; 152 goto cleanup; 153 } 154 u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); 155 156 parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); 157 158 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign 159 // given the syntax of the input. 160 if (*input == 0xfeff) { 161 *input = 0x20; 162 } 163 164 // Parse the input, one line per iteration of this loop. 165 uregex_setText(parseRegexp, input, inputLen, &status); 166 while (uregex_findNext(parseRegexp, &status)) { 167 lineNum++; 168 if (uregex_start(parseRegexp, 1, &status) >= 0) { 169 // this was a blank or comment line. 170 continue; 171 } 172 if (uregex_start(parseRegexp, 8, &status) >= 0) { 173 // input file syntax error. 174 status = U_PARSE_ERROR; 175 goto cleanup; 176 } 177 if (U_FAILURE(status)) { 178 goto cleanup; 179 } 180 181 // Pick up the start and optional range end code points from the parsed line. 182 UChar32 startCodePoint = SpoofImpl::ScanHex( 183 input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); 184 UChar32 endCodePoint = startCodePoint; 185 if (uregex_start(parseRegexp, 3, &status) >=0) { 186 endCodePoint = SpoofImpl::ScanHex( 187 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); 188 } 189 190 // Extract the two script names from the source line. We need these in an 8 bit 191 // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on 192 // to the ICU u_getPropertyValueEnum() function. Ugh. 193 char srcScriptName[20]; 194 char targScriptName[20]; 195 extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); 196 extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); 197 UScriptCode srcScript = 198 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); 199 UScriptCode targScript = 200 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); 201 if (U_FAILURE(status)) { 202 goto cleanup; 203 } 204 if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { 205 status = U_INVALID_FORMAT_ERROR; 206 goto cleanup; 207 } 208 209 // select the table - (A) any case or (L) lower case only 210 UTrie2 *table = anyCaseTrie; 211 if (uregex_start(parseRegexp, 7, &status) >= 0) { 212 table = lowerCaseTrie; 213 } 214 215 // Build the set of scripts containing confusable characters for 216 // the code point(s) specified in this input line. 217 // Sanity check that the script of the source code point is the same 218 // as the source script indicated in the input file. Failure of this check is 219 // an error in the input file. 220 // Include the source script in the set (needed for Mixed Script Confusable detection). 221 // 222 UChar32 cp; 223 for (cp=startCodePoint; cp<=endCodePoint; cp++) { 224 int32_t setIndex = utrie2_get32(table, cp); 225 BuilderScriptSet *bsset = NULL; 226 if (setIndex > 0) { 227 U_ASSERT(setIndex < scriptSets->size()); 228 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); 229 } else { 230 bsset = new BuilderScriptSet(); 231 if (bsset == NULL) { 232 status = U_MEMORY_ALLOCATION_ERROR; 233 goto cleanup; 234 } 235 bsset->codePoint = cp; 236 bsset->trie = table; 237 bsset->sset = new ScriptSet(); 238 setIndex = scriptSets->size(); 239 bsset->index = setIndex; 240 bsset->rindex = 0; 241 if (bsset->sset == NULL) { 242 status = U_MEMORY_ALLOCATION_ERROR; 243 goto cleanup; 244 } 245 scriptSets->addElement(bsset, status); 246 utrie2_set32(table, cp, setIndex, &status); 247 } 248 bsset->sset->set(targScript, status); 249 bsset->sset->set(srcScript, status); 250 251 if (U_FAILURE(status)) { 252 goto cleanup; 253 } 254 UScriptCode cpScript = uscript_getScript(cp, &status); 255 if (cpScript != srcScript) { 256 status = U_INVALID_FORMAT_ERROR; 257 goto cleanup; 258 } 259 } 260 } 261 262 // Eliminate duplicate script sets. At this point we have a separate 263 // script set for every code point that had data in the input file. 264 // 265 // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them 266 // 267 // printf("Number of scriptSets: %d\n", scriptSets->size()); 268 { 269 int32_t duplicateCount = 0; 270 rtScriptSetsCount = 2; 271 for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { 272 BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); 273 if (outerSet->index != static_cast<uint32_t>(outeri)) { 274 // This set was already identified as a duplicate. 275 // It will not be allocated a position in the runtime array of ScriptSets. 276 continue; 277 } 278 outerSet->rindex = rtScriptSetsCount++; 279 for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { 280 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); 281 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { 282 delete innerSet->sset; 283 innerSet->scriptSetOwned = FALSE; 284 innerSet->sset = outerSet->sset; 285 innerSet->index = outeri; 286 innerSet->rindex = outerSet->rindex; 287 duplicateCount++; 288 } 289 // But this doesn't get all. We need to fix the TRIE. 290 } 291 } 292 // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); 293 } 294 295 296 297 // Update the Trie values to be reflect the run time script indexes (after duplicate merging). 298 // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets 299 // are unused, which is why the loop index starts at 2.) 300 { 301 for (int32_t i=2; i<scriptSets->size(); i++) { 302 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 303 if (bSet->rindex != (uint32_t)i) { 304 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); 305 } 306 } 307 } 308 309 // For code points with script==Common or script==Inherited, 310 // Set the reserved value of 1 into both Tries. These characters do not participate 311 // in Whole Script Confusable detection; this reserved value is the means 312 // by which they are detected. 313 { 314 UnicodeSet ignoreSet; 315 ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 316 UnicodeSet inheritedSet; 317 inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 318 ignoreSet.addAll(inheritedSet); 319 for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { 320 UChar32 rangeStart = ignoreSet.getRangeStart(rn); 321 UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); 322 utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); 323 utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); 324 } 325 } 326 327 // Serialize the data to the Spoof Detector 328 { 329 utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); 330 int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); 331 // printf("Any case Trie size: %d\n", size); 332 if (status != U_BUFFER_OVERFLOW_ERROR) { 333 goto cleanup; 334 } 335 status = U_ZERO_ERROR; 336 spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; 337 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; 338 spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; 339 void *where = spImpl->fSpoofData->reserveSpace(size, status); 340 utrie2_serialize(anyCaseTrie, where, size, &status); 341 342 utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); 343 size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); 344 // printf("Lower case Trie size: %d\n", size); 345 if (status != U_BUFFER_OVERFLOW_ERROR) { 346 goto cleanup; 347 } 348 status = U_ZERO_ERROR; 349 spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; 350 spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; 351 spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; 352 where = spImpl->fSpoofData->reserveSpace(size, status); 353 utrie2_serialize(lowerCaseTrie, where, size, &status); 354 355 spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; 356 spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; 357 ScriptSet *rtScriptSets = static_cast<ScriptSet *> 358 (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); 359 uint32_t rindex = 2; 360 for (int32_t i=2; i<scriptSets->size(); i++) { 361 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 362 if (bSet->rindex < rindex) { 363 // We have already copied this script set to the serialized data. 364 continue; 365 } 366 U_ASSERT(rindex == bSet->rindex); 367 rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. 368 rindex++; 369 } 370 } 371 372 // Open new utrie2s from the serialized data. We don't want to keep the ones 373 // we just built because we would then have two copies of the data, one internal to 374 // the utries that we have already constructed, and one in the serialized data area. 375 // An alternative would be to not pre-serialize the Trie data, but that makes the 376 // spoof detector data different, depending on how the detector was constructed. 377 // It's simpler to keep the data always the same. 378 379 spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( 380 UTRIE2_16_VALUE_BITS, 381 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, 382 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, 383 NULL, 384 &status); 385 386 spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( 387 UTRIE2_16_VALUE_BITS, 388 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, 389 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, 390 NULL, 391 &status); 392 393 394 395cleanup: 396 if (U_FAILURE(status)) { 397 pe->line = lineNum; 398 } 399 uregex_close(parseRegexp); 400 uprv_free(input); 401 402 int32_t i; 403 if (scriptSets != NULL) { 404 for (i=0; i<scriptSets->size(); i++) { 405 BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); 406 delete bsset; 407 } 408 delete scriptSets; 409 } 410 utrie2_close(anyCaseTrie); 411 utrie2_close(lowerCaseTrie); 412 return; 413} 414 415U_NAMESPACE_END 416 417 418 419BuilderScriptSet::BuilderScriptSet() { 420 codePoint = -1; 421 trie = NULL; 422 sset = NULL; 423 index = 0; 424 rindex = 0; 425 scriptSetOwned = TRUE; 426} 427 428BuilderScriptSet::~BuilderScriptSet() { 429 if (scriptSetOwned) { 430 delete sset; 431 } 432} 433 434#endif 435#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 436 437