1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4****************************************************************************** 5* 6* Copyright (C) 2008-2015, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9****************************************************************************** 10* file name: uspoof_conf.cpp 11* encoding: US-ASCII 12* tab size: 8 (not used) 13* indentation:4 14* 15* created on: 2009Jan05 (refactoring earlier files) 16* created by: Andy Heninger 17* 18* Internal classes for compililing confusable data into its binary (runtime) form. 19*/ 20 21#include "unicode/utypes.h" 22#include "unicode/uspoof.h" 23#if !UCONFIG_NO_REGULAR_EXPRESSIONS 24#if !UCONFIG_NO_NORMALIZATION 25 26#include "unicode/unorm.h" 27#include "unicode/uregex.h" 28#include "unicode/ustring.h" 29#include "cmemory.h" 30#include "uspoof_impl.h" 31#include "uhash.h" 32#include "uvector.h" 33#include "uassert.h" 34#include "uarrsort.h" 35#include "uspoof_conf.h" 36 37U_NAMESPACE_USE 38 39 40//--------------------------------------------------------------------- 41// 42// buildConfusableData Compile the source confusable data, as defined by 43// the Unicode data file confusables.txt, into the binary 44// structures used by the confusable detector. 45// 46// The binary structures are described in uspoof_impl.h 47// 48// 1. Parse the data, making a hash table mapping from a UChar32 to a String. 49// 50// 2. Sort all of the strings encountered by length, since they will need to 51// be stored in that order in the final string table. 52// TODO: Sorting these strings by length is no longer needed since the removal of 53// the string lengths table. This logic can be removed to save processing time 54// when building confusables data. 55// 56// 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the 57// list because that will be the ordering of our runtime table. 58// 59// 4. Generate the run time string table. This is generated before the key & value 60// tables because we need the string indexes when building those tables. 61// 62// 5. Build the run-time key and value tables. These are parallel tables, and are built 63// at the same time 64// 65 66SPUString::SPUString(UnicodeString *s) { 67 fStr = s; 68 fCharOrStrTableIndex = 0; 69} 70 71 72SPUString::~SPUString() { 73 delete fStr; 74} 75 76 77SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) { 78 fVec = new UVector(status); 79 fHash = uhash_open(uhash_hashUnicodeString, // key hash function 80 uhash_compareUnicodeString, // Key Comparator 81 NULL, // Value Comparator 82 &status); 83} 84 85 86SPUStringPool::~SPUStringPool() { 87 int i; 88 for (i=fVec->size()-1; i>=0; i--) { 89 SPUString *s = static_cast<SPUString *>(fVec->elementAt(i)); 90 delete s; 91 } 92 delete fVec; 93 uhash_close(fHash); 94} 95 96 97int32_t SPUStringPool::size() { 98 return fVec->size(); 99} 100 101SPUString *SPUStringPool::getByIndex(int32_t index) { 102 SPUString *retString = (SPUString *)fVec->elementAt(index); 103 return retString; 104} 105 106 107// Comparison function for ordering strings in the string pool. 108// Compare by length first, then, within a group of the same length, 109// by code point order. 110// Conforms to the type signature for a USortComparator in uvector.h 111 112static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) { 113 const SPUString *sL = const_cast<const SPUString *>( 114 static_cast<SPUString *>(left.pointer)); 115 const SPUString *sR = const_cast<const SPUString *>( 116 static_cast<SPUString *>(right.pointer)); 117 int32_t lenL = sL->fStr->length(); 118 int32_t lenR = sR->fStr->length(); 119 if (lenL < lenR) { 120 return -1; 121 } else if (lenL > lenR) { 122 return 1; 123 } else { 124 return sL->fStr->compare(*(sR->fStr)); 125 } 126} 127 128void SPUStringPool::sort(UErrorCode &status) { 129 fVec->sort(SPUStringCompare, status); 130} 131 132 133SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) { 134 SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src)); 135 if (hashedString != NULL) { 136 delete src; 137 } else { 138 hashedString = new SPUString(src); 139 uhash_put(fHash, src, hashedString, &status); 140 fVec->addElement(hashedString, status); 141 } 142 return hashedString; 143} 144 145 146 147ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) : 148 fSpoofImpl(spImpl), 149 fInput(NULL), 150 fTable(NULL), 151 fKeySet(NULL), 152 fKeyVec(NULL), 153 fValueVec(NULL), 154 fStringTable(NULL), 155 stringPool(NULL), 156 fParseLine(NULL), 157 fParseHexNum(NULL), 158 fLineNum(0) 159{ 160 if (U_FAILURE(status)) { 161 return; 162 } 163 fTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status); 164 fKeySet = new UnicodeSet(); 165 fKeyVec = new UVector(status); 166 fValueVec = new UVector(status); 167 stringPool = new SPUStringPool(status); 168} 169 170 171ConfusabledataBuilder::~ConfusabledataBuilder() { 172 uprv_free(fInput); 173 uregex_close(fParseLine); 174 uregex_close(fParseHexNum); 175 uhash_close(fTable); 176 delete fKeySet; 177 delete fKeyVec; 178 delete fStringTable; 179 delete fValueVec; 180 delete stringPool; 181} 182 183 184void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables, 185 int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) { 186 187 if (U_FAILURE(status)) { 188 return; 189 } 190 ConfusabledataBuilder builder(spImpl, status); 191 builder.build(confusables, confusablesLen, status); 192 if (U_FAILURE(status) && errorType != NULL) { 193 *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 194 pe->line = builder.fLineNum; 195 } 196} 197 198 199void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen, 200 UErrorCode &status) { 201 202 // Convert the user input data from UTF-8 to UChar (UTF-16) 203 int32_t inputLen = 0; 204 if (U_FAILURE(status)) { 205 return; 206 } 207 u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status); 208 if (status != U_BUFFER_OVERFLOW_ERROR) { 209 return; 210 } 211 status = U_ZERO_ERROR; 212 fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); 213 if (fInput == NULL) { 214 status = U_MEMORY_ALLOCATION_ERROR; 215 return; 216 } 217 u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status); 218 219 220 // Regular Expression to parse a line from Confusables.txt. The expression will match 221 // any line. What was matched is determined by examining which capture groups have a match. 222 // Capture Group 1: the source char 223 // Capture Group 2: the replacement chars 224 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated) 225 // Capture Group 7: A blank or comment only line. 226 // Capture Group 8: A syntactically invalid line. Anything that didn't match before. 227 // Example Line from the confusables.txt source file: 228 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... " 229 UnicodeString pattern( 230 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char 231 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s) 232 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued) 233 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type 234 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment 235 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment 236 "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines. 237 // TODO: Why are we using the regex C API here? C++ would just take UnicodeString... 238 fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); 239 240 // Regular expression for parsing a hex number out of a space-separated list of them. 241 // Capture group 1 gets the number, with spaces removed. 242 pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)"); 243 fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); 244 245 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign 246 // given the syntax of the input. 247 if (*fInput == 0xfeff) { 248 *fInput = 0x20; 249 } 250 251 // Parse the input, one line per iteration of this loop. 252 uregex_setText(fParseLine, fInput, inputLen, &status); 253 while (uregex_findNext(fParseLine, &status)) { 254 fLineNum++; 255 if (uregex_start(fParseLine, 7, &status) >= 0) { 256 // this was a blank or comment line. 257 continue; 258 } 259 if (uregex_start(fParseLine, 8, &status) >= 0) { 260 // input file syntax error. 261 status = U_PARSE_ERROR; 262 return; 263 } 264 265 // We have a good input line. Extract the key character and mapping string, and 266 // put them into the appropriate mapping table. 267 UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status), 268 uregex_end(fParseLine, 1, &status), status); 269 270 int32_t mapStringStart = uregex_start(fParseLine, 2, &status); 271 int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart; 272 uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status); 273 274 UnicodeString *mapString = new UnicodeString(); 275 if (mapString == NULL) { 276 status = U_MEMORY_ALLOCATION_ERROR; 277 return; 278 } 279 while (uregex_findNext(fParseHexNum, &status)) { 280 UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status), 281 uregex_end(fParseHexNum, 1, &status), status); 282 mapString->append(c); 283 } 284 U_ASSERT(mapString->length() >= 1); 285 286 // Put the map (value) string into the string pool 287 // This a little like a Java intern() - any duplicates will be eliminated. 288 SPUString *smapString = stringPool->addString(mapString, status); 289 290 // Add the UChar32 -> string mapping to the table. 291 // For Unicode 8, the SL, SA and ML tables have been discontinued. 292 // All input data from confusables.txt is tagged MA. 293 uhash_iput(fTable, keyChar, smapString, &status); 294 if (U_FAILURE(status)) { return; } 295 fKeySet->add(keyChar); 296 } 297 298 // Input data is now all parsed and collected. 299 // Now create the run-time binary form of the data. 300 // 301 // This is done in two steps. First the data is assembled into vectors and strings, 302 // for ease of construction, then the contents of these collections are dumped 303 // into the actual raw-bytes data storage. 304 305 // Build up the string array, and record the index of each string therein 306 // in the (build time only) string pool. 307 // Strings of length one are not entered into the strings array. 308 // (Strings in the table are sorted by length) 309 stringPool->sort(status); 310 fStringTable = new UnicodeString(); 311 int32_t poolSize = stringPool->size(); 312 int32_t i; 313 for (i=0; i<poolSize; i++) { 314 SPUString *s = stringPool->getByIndex(i); 315 int32_t strLen = s->fStr->length(); 316 int32_t strIndex = fStringTable->length(); 317 if (strLen == 1) { 318 // strings of length one do not get an entry in the string table. 319 // Keep the single string character itself here, which is the same 320 // convention that is used in the final run-time string table index. 321 s->fCharOrStrTableIndex = s->fStr->charAt(0); 322 } else { 323 s->fCharOrStrTableIndex = strIndex; 324 fStringTable->append(*(s->fStr)); 325 } 326 } 327 328 // Construct the compile-time Key and Value tables 329 // 330 // For each key code point, check which mapping tables it applies to, 331 // and create the final data for the key & value structures. 332 // 333 // The four logical mapping tables are conflated into one combined table. 334 // If multiple logical tables have the same mapping for some key, they 335 // share a single entry in the combined table. 336 // If more than one mapping exists for the same key code point, multiple 337 // entries will be created in the table 338 339 for (int32_t range=0; range<fKeySet->getRangeCount(); range++) { 340 // It is an oddity of the UnicodeSet API that simply enumerating the contained 341 // code points requires a nested loop. 342 for (UChar32 keyChar=fKeySet->getRangeStart(range); 343 keyChar <= fKeySet->getRangeEnd(range); keyChar++) { 344 SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar)); 345 U_ASSERT(targetMapping != NULL); 346 347 // Set an error code if trying to consume a long string. Otherwise, 348 // codePointAndLengthToKey will abort on a U_ASSERT. 349 if (targetMapping->fStr->length() > 256) { 350 status = U_ILLEGAL_ARGUMENT_ERROR; 351 return; 352 } 353 354 int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar, 355 targetMapping->fStr->length()); 356 int32_t value = targetMapping->fCharOrStrTableIndex; 357 358 fKeyVec->addElement(key, status); 359 fValueVec->addElement(value, status); 360 } 361 } 362 363 // Put the assembled data into the flat runtime array 364 outputData(status); 365 366 // All of the intermediate allocated data belongs to the ConfusabledataBuilder 367 // object (this), and is deleted in the destructor. 368 return; 369} 370 371// 372// outputData The confusable data has been compiled and stored in intermediate 373// collections and strings. Copy it from there to the final flat 374// binary array. 375// 376// Note that as each section is added to the output data, the 377// expand (reserveSpace() function will likely relocate it in memory. 378// Be careful with pointers. 379// 380void ConfusabledataBuilder::outputData(UErrorCode &status) { 381 382 U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE); 383 384 // The Key Table 385 // While copying the keys to the runtime array, 386 // also sanity check that they are sorted. 387 388 int32_t numKeys = fKeyVec->size(); 389 int32_t *keys = 390 static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status)); 391 if (U_FAILURE(status)) { 392 return; 393 } 394 int i; 395 UChar32 previousCodePoint = 0; 396 for (i=0; i<numKeys; i++) { 397 int32_t key = fKeyVec->elementAti(i); 398 UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key); 399 // strictly greater because there can be only one entry per code point 400 U_ASSERT(codePoint > previousCodePoint); 401 keys[i] = key; 402 previousCodePoint = codePoint; 403 } 404 SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData; 405 rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData); 406 rawData->fCFUKeysSize = numKeys; 407 fSpoofImpl->fSpoofData->fCFUKeys = keys; 408 409 410 // The Value Table, parallels the key table 411 int32_t numValues = fValueVec->size(); 412 U_ASSERT(numKeys == numValues); 413 uint16_t *values = 414 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status)); 415 if (U_FAILURE(status)) { 416 return; 417 } 418 for (i=0; i<numValues; i++) { 419 uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i)); 420 U_ASSERT(value < 0xffff); 421 values[i] = static_cast<uint16_t>(value); 422 } 423 rawData = fSpoofImpl->fSpoofData->fRawData; 424 rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData); 425 rawData->fCFUStringIndexSize = numValues; 426 fSpoofImpl->fSpoofData->fCFUValues = values; 427 428 // The Strings Table. 429 430 uint32_t stringsLength = fStringTable->length(); 431 // Reserve an extra space so the string will be nul-terminated. This is 432 // only a convenience, for when debugging; it is not needed otherwise. 433 UChar *strings = 434 static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status)); 435 if (U_FAILURE(status)) { 436 return; 437 } 438 fStringTable->extract(strings, stringsLength+1, status); 439 rawData = fSpoofImpl->fSpoofData->fRawData; 440 U_ASSERT(rawData->fCFUStringTable == 0); 441 rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData); 442 rawData->fCFUStringTableLen = stringsLength; 443 fSpoofImpl->fSpoofData->fCFUStrings = strings; 444} 445 446#endif 447#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 448 449