1/* 2******************************************************************************* 3* 4* Copyright (C) 2008, International Business Machines 5* Corporation, Google and others. All Rights Reserved. 6* 7******************************************************************************* 8*/ 9// Author : eldawy@google.com (Mohamed Eldawy) 10// ucnvsel.cpp 11// 12// Purpose: To generate a list of encodings capable of handling 13// a given Unicode text 14// 15// Started 09-April-2008 16 17/** 18 * \file 19 * 20 * This is an implementation of an encoding selector. 21 * The goal is, given a unicode string, find the encodings 22 * this string can be mapped to. To make processing faster 23 * a trie is built when you call ucnvsel_open() that 24 * stores all encodings a codepoint can map to 25 */ 26 27#include "unicode/ucnvsel.h" 28 29#include <string.h> 30 31#include "unicode/uchar.h" 32#include "unicode/uniset.h" 33#include "unicode/ucnv.h" 34#include "unicode/ustring.h" 35#include "unicode/uchriter.h" 36#include "utrie2.h" 37#include "propsvec.h" 38#include "uassert.h" 39#include "ucmndata.h" 40#include "uenumimp.h" 41#include "cmemory.h" 42#include "cstring.h" 43 44U_NAMESPACE_USE 45 46struct UConverterSelector { 47 UTrie2 *trie; // 16 bit trie containing offsets into pv 48 uint32_t* pv; // table of bits! 49 int32_t pvCount; 50 char** encodings; // which encodings did user ask to use? 51 int32_t encodingsCount; 52 int32_t encodingStrLength; 53 uint8_t* swapped; 54 UBool ownPv, ownEncodingStrings; 55}; 56 57static void generateSelectorData(UConverterSelector* result, 58 UPropsVectors *upvec, 59 const USet* excludedCodePoints, 60 const UConverterUnicodeSet whichSet, 61 UErrorCode* status) { 62 if (U_FAILURE(*status)) { 63 return; 64 } 65 66 int32_t columns = (result->encodingsCount+31)/32; 67 68 // set errorValue to all-ones 69 for (int32_t col = 0; col < columns; col++) { 70 upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, 71 col, ~0, ~0, status); 72 } 73 74 for (int32_t i = 0; i < result->encodingsCount; ++i) { 75 uint32_t mask; 76 uint32_t column; 77 int32_t item_count; 78 int32_t j; 79 UConverter* test_converter = ucnv_open(result->encodings[i], status); 80 if (U_FAILURE(*status)) { 81 return; 82 } 83 USet* unicode_point_set; 84 unicode_point_set = uset_open(1, 0); // empty set 85 86 ucnv_getUnicodeSet(test_converter, unicode_point_set, 87 whichSet, status); 88 if (U_FAILURE(*status)) { 89 ucnv_close(test_converter); 90 return; 91 } 92 93 column = i / 32; 94 mask = 1 << (i%32); 95 // now iterate over intervals on set i! 96 item_count = uset_getItemCount(unicode_point_set); 97 98 for (j = 0; j < item_count; ++j) { 99 UChar32 start_char; 100 UChar32 end_char; 101 UErrorCode smallStatus = U_ZERO_ERROR; 102 uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, 103 &smallStatus); 104 if (U_FAILURE(smallStatus)) { 105 // this will be reached for the converters that fill the set with 106 // strings. Those should be ignored by our system 107 } else { 108 upvec_setValue(upvec, start_char, end_char, column, ~0, mask, 109 status); 110 } 111 } 112 ucnv_close(test_converter); 113 uset_close(unicode_point_set); 114 if (U_FAILURE(*status)) { 115 return; 116 } 117 } 118 119 // handle excluded encodings! Simply set their values to all 1's in the upvec 120 if (excludedCodePoints) { 121 int32_t item_count = uset_getItemCount(excludedCodePoints); 122 for (int32_t j = 0; j < item_count; ++j) { 123 UChar32 start_char; 124 UChar32 end_char; 125 126 uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, 127 status); 128 for (int32_t col = 0; col < columns; col++) { 129 upvec_setValue(upvec, start_char, end_char, col, ~0, ~0, 130 status); 131 } 132 } 133 } 134 135 // alright. Now, let's put things in the same exact form you'd get when you 136 // unserialize things. 137 result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); 138 result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); 139 result->pvCount *= columns; // number of uint32_t = rows * columns 140 result->ownPv = TRUE; 141} 142 143/* open a selector. If converterListSize is 0, build for all converters. 144 If excludedCodePoints is NULL, don't exclude any codepoints */ 145U_CAPI UConverterSelector* U_EXPORT2 146ucnvsel_open(const char* const* converterList, int32_t converterListSize, 147 const USet* excludedCodePoints, 148 const UConverterUnicodeSet whichSet, UErrorCode* status) { 149 // check if already failed 150 if (U_FAILURE(*status)) { 151 return NULL; 152 } 153 // ensure args make sense! 154 if (converterListSize < 0 || (converterList == NULL && converterListSize != 0)) { 155 *status = U_ILLEGAL_ARGUMENT_ERROR; 156 return NULL; 157 } 158 159 // allocate a new converter 160 UConverterSelector* newSelector = 161 (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); 162 if (!newSelector) { 163 *status = U_MEMORY_ALLOCATION_ERROR; 164 return NULL; 165 } 166 uprv_memset(newSelector, 0, sizeof(UConverterSelector)); 167 168 if (converterListSize == 0) { 169 converterList = NULL; 170 converterListSize = ucnv_countAvailable(); 171 } 172 newSelector->encodings = 173 (char**)uprv_malloc(converterListSize * sizeof(char*)); 174 if (!newSelector->encodings) { 175 *status = U_MEMORY_ALLOCATION_ERROR; 176 uprv_free(newSelector); 177 return NULL; 178 } 179 newSelector->encodings[0] = NULL; // now we can call ucnvsel_close() 180 181 // make a backup copy of the list of converters 182 int32_t totalSize = 0; 183 int32_t i; 184 for (i = 0; i < converterListSize; i++) { 185 totalSize += 186 uprv_strlen(converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)) + 1; 187 } 188 // 4-align the totalSize to 4-align the size of the serialized form 189 int32_t encodingStrPadding = totalSize & 3; 190 if (encodingStrPadding != 0) { 191 encodingStrPadding = 4 - encodingStrPadding; 192 } 193 newSelector->encodingStrLength = totalSize += encodingStrPadding; 194 char* allStrings = (char*) uprv_malloc(totalSize); 195 if (!allStrings) { 196 *status = U_MEMORY_ALLOCATION_ERROR; 197 ucnvsel_close(newSelector); 198 return NULL; 199 } 200 201 for (i = 0; i < converterListSize; i++) { 202 newSelector->encodings[i] = allStrings; 203 uprv_strcpy(newSelector->encodings[i], 204 converterList != NULL ? converterList[i] : ucnv_getAvailableName(i)); 205 allStrings += uprv_strlen(newSelector->encodings[i]) + 1; 206 } 207 while (encodingStrPadding > 0) { 208 *allStrings++ = 0; 209 --encodingStrPadding; 210 } 211 212 newSelector->ownEncodingStrings = TRUE; 213 newSelector->encodingsCount = converterListSize; 214 UPropsVectors *upvec = upvec_open((converterListSize+31)/32, status); 215 generateSelectorData(newSelector, upvec, excludedCodePoints, whichSet, status); 216 upvec_close(upvec); 217 218 if (U_FAILURE(*status)) { 219 ucnvsel_close(newSelector); 220 return NULL; 221 } 222 223 return newSelector; 224} 225 226/* close opened selector */ 227U_CAPI void U_EXPORT2 228ucnvsel_close(UConverterSelector *sel) { 229 if (!sel) { 230 return; 231 } 232 if (sel->ownEncodingStrings) { 233 uprv_free(sel->encodings[0]); 234 } 235 uprv_free(sel->encodings); 236 if (sel->ownPv) { 237 uprv_free(sel->pv); 238 } 239 utrie2_close(sel->trie); 240 uprv_free(sel->swapped); 241 uprv_free(sel); 242} 243 244static const UDataInfo dataInfo = { 245 sizeof(UDataInfo), 246 0, 247 248 U_IS_BIG_ENDIAN, 249 U_CHARSET_FAMILY, 250 U_SIZEOF_UCHAR, 251 0, 252 253 { 0x43, 0x53, 0x65, 0x6c }, /* dataFormat="CSel" */ 254 { 1, 0, 0, 0 }, /* formatVersion */ 255 { 0, 0, 0, 0 } /* dataVersion */ 256}; 257 258enum { 259 UCNVSEL_INDEX_TRIE_SIZE, // trie size in bytes 260 UCNVSEL_INDEX_PV_COUNT, // number of uint32_t in the bit vectors 261 UCNVSEL_INDEX_NAMES_COUNT, // number of encoding names 262 UCNVSEL_INDEX_NAMES_LENGTH, // number of encoding name bytes including padding 263 UCNVSEL_INDEX_SIZE = 15, // bytes following the DataHeader 264 UCNVSEL_INDEX_COUNT = 16 265}; 266 267/* 268 * Serialized form of a UConverterSelector, formatVersion 1: 269 * 270 * The serialized form begins with a standard ICU DataHeader with a UDataInfo 271 * as the template above. 272 * This is followed by: 273 * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above 274 * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes 275 * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors 276 * char* encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding 277 */ 278 279/* serialize a selector */ 280U_CAPI int32_t U_EXPORT2 281ucnvsel_serialize(const UConverterSelector* sel, 282 void* buffer, int32_t bufferCapacity, UErrorCode* status) { 283 // check if already failed 284 if (U_FAILURE(*status)) { 285 return 0; 286 } 287 // ensure args make sense! 288 uint8_t *p = (uint8_t *)buffer; 289 if (bufferCapacity < 0 || 290 (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 291 ) { 292 *status = U_ILLEGAL_ARGUMENT_ERROR; 293 return 0; 294 } 295 // add up the size of the serialized form 296 int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status); 297 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { 298 return 0; 299 } 300 *status = U_ZERO_ERROR; 301 302 DataHeader header; 303 uprv_memset(&header, 0, sizeof(header)); 304 header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15); 305 header.dataHeader.magic1 = 0xda; 306 header.dataHeader.magic2 = 0x27; 307 uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo)); 308 309 int32_t indexes[UCNVSEL_INDEX_COUNT] = { 310 serializedTrieSize, 311 sel->pvCount, 312 sel->encodingsCount, 313 sel->encodingStrLength 314 }; 315 316 int32_t totalSize = 317 header.dataHeader.headerSize + 318 (int32_t)sizeof(indexes) + 319 serializedTrieSize + 320 sel->pvCount * 4 + 321 sel->encodingStrLength; 322 indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize; 323 if (totalSize > bufferCapacity) { 324 *status = U_BUFFER_OVERFLOW_ERROR; 325 return totalSize; 326 } 327 // ok, save! 328 int32_t length = header.dataHeader.headerSize; 329 uprv_memcpy(p, &header, sizeof(header)); 330 uprv_memset(p + sizeof(header), 0, length - sizeof(header)); 331 p += length; 332 333 length = (int32_t)sizeof(indexes); 334 uprv_memcpy(p, indexes, length); 335 p += length; 336 337 utrie2_serialize(sel->trie, p, serializedTrieSize, status); 338 p += serializedTrieSize; 339 340 length = sel->pvCount * 4; 341 uprv_memcpy(p, sel->pv, length); 342 p += length; 343 344 uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength); 345 p += sel->encodingStrLength; 346 347 return totalSize; 348} 349 350/** 351 * swap a selector into the desired Endianness and Asciiness of 352 * the system. Just as FYI, selectors are always saved in the format 353 * of the system that created them. They are only converted if used 354 * on another system. In other words, selectors created on different 355 * system can be different even if the params are identical (endianness 356 * and Asciiness differences only) 357 * 358 * @param ds pointer to data swapper containing swapping info 359 * @param inData pointer to incoming data 360 * @param length length of inData in bytes 361 * @param outData pointer to output data. Capacity should 362 * be at least equal to capacity of inData 363 * @param status an in/out ICU UErrorCode 364 * @return 0 on failure, number of bytes swapped on success 365 * number of bytes swapped can be smaller than length 366 */ 367static int32_t 368ucnvsel_swap(const UDataSwapper *ds, 369 const void *inData, int32_t length, 370 void *outData, UErrorCode *status) { 371 /* udata_swapDataHeader checks the arguments */ 372 int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, status); 373 if(U_FAILURE(*status)) { 374 return 0; 375 } 376 377 /* check data format and format version */ 378 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); 379 if(!( 380 pInfo->dataFormat[0] == 0x43 && /* dataFormat="CSel" */ 381 pInfo->dataFormat[1] == 0x53 && 382 pInfo->dataFormat[2] == 0x65 && 383 pInfo->dataFormat[3] == 0x6c 384 )) { 385 udata_printError(ds, "ucnvsel_swap(): data format %02x.%02x.%02x.%02x is not recognized as UConverterSelector data\n", 386 pInfo->dataFormat[0], pInfo->dataFormat[1], 387 pInfo->dataFormat[2], pInfo->dataFormat[3]); 388 *status = U_INVALID_FORMAT_ERROR; 389 return 0; 390 } 391 if(pInfo->formatVersion[0] != 1) { 392 udata_printError(ds, "ucnvsel_swap(): format version %02x is not supported\n", 393 pInfo->formatVersion[0]); 394 *status = U_UNSUPPORTED_ERROR; 395 return 0; 396 } 397 398 if(length >= 0) { 399 length -= headerSize; 400 if(length < 16*4) { 401 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for UConverterSelector data\n", 402 length); 403 *status = U_INDEX_OUTOFBOUNDS_ERROR; 404 return 0; 405 } 406 } 407 408 const uint8_t *inBytes = (const uint8_t *)inData + headerSize; 409 uint8_t *outBytes = (uint8_t *)outData + headerSize; 410 411 /* read the indexes */ 412 const int32_t *inIndexes = (const int32_t *)inBytes; 413 int32_t indexes[16]; 414 int32_t i; 415 for(i = 0; i < 16; ++i) { 416 indexes[i] = udata_readInt32(ds, inIndexes[i]); 417 } 418 419 /* get the total length of the data */ 420 int32_t size = indexes[UCNVSEL_INDEX_SIZE]; 421 if(length >= 0) { 422 if(length < size) { 423 udata_printError(ds, "ucnvsel_swap(): too few bytes (%d after header) for all of UConverterSelector data\n", 424 length); 425 *status = U_INDEX_OUTOFBOUNDS_ERROR; 426 return 0; 427 } 428 429 /* copy the data for inaccessible bytes */ 430 if(inBytes != outBytes) { 431 uprv_memcpy(outBytes, inBytes, size); 432 } 433 434 int32_t offset = 0, count; 435 436 /* swap the int32_t indexes[] */ 437 count = UCNVSEL_INDEX_COUNT*4; 438 ds->swapArray32(ds, inBytes, count, outBytes, status); 439 offset += count; 440 441 /* swap the UTrie2 */ 442 count = indexes[UCNVSEL_INDEX_TRIE_SIZE]; 443 utrie2_swap(ds, inBytes + offset, count, outBytes + offset, status); 444 offset += count; 445 446 /* swap the uint32_t pv[] */ 447 count = indexes[UCNVSEL_INDEX_PV_COUNT]*4; 448 ds->swapArray32(ds, inBytes + offset, count, outBytes + offset, status); 449 offset += count; 450 451 /* swap the encoding names */ 452 count = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 453 ds->swapInvChars(ds, inBytes + offset, count, outBytes + offset, status); 454 offset += count; 455 456 U_ASSERT(offset == size); 457 } 458 459 return headerSize + size; 460} 461 462/* unserialize a selector */ 463U_CAPI UConverterSelector* U_EXPORT2 464ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) { 465 // check if already failed 466 if (U_FAILURE(*status)) { 467 return NULL; 468 } 469 // ensure args make sense! 470 const uint8_t *p = (const uint8_t *)buffer; 471 if (length <= 0 || 472 (length > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0))) 473 ) { 474 *status = U_ILLEGAL_ARGUMENT_ERROR; 475 return NULL; 476 } 477 // header 478 if (length < 32) { 479 // not even enough space for a minimal header 480 *status = U_INDEX_OUTOFBOUNDS_ERROR; 481 return NULL; 482 } 483 const DataHeader *pHeader = (const DataHeader *)p; 484 if (!( 485 pHeader->dataHeader.magic1==0xda && 486 pHeader->dataHeader.magic2==0x27 && 487 pHeader->info.dataFormat[0] == 0x43 && 488 pHeader->info.dataFormat[1] == 0x53 && 489 pHeader->info.dataFormat[2] == 0x65 && 490 pHeader->info.dataFormat[3] == 0x6c 491 )) { 492 /* header not valid or dataFormat not recognized */ 493 *status = U_INVALID_FORMAT_ERROR; 494 return NULL; 495 } 496 if (pHeader->info.formatVersion[0] != 1) { 497 *status = U_UNSUPPORTED_ERROR; 498 return NULL; 499 } 500 uint8_t* swapped = NULL; 501 if (pHeader->info.isBigEndian != U_IS_BIG_ENDIAN || 502 pHeader->info.charsetFamily != U_CHARSET_FAMILY 503 ) { 504 // swap the data 505 UDataSwapper *ds = 506 udata_openSwapperForInputData(p, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status); 507 int32_t totalSize = ucnvsel_swap(ds, p, -1, NULL, status); 508 if (U_FAILURE(*status)) { 509 udata_closeSwapper(ds); 510 return NULL; 511 } 512 if (length < totalSize) { 513 udata_closeSwapper(ds); 514 *status = U_INDEX_OUTOFBOUNDS_ERROR; 515 return NULL; 516 } 517 swapped = (uint8_t*)uprv_malloc(totalSize); 518 if (swapped == NULL) { 519 udata_closeSwapper(ds); 520 *status = U_MEMORY_ALLOCATION_ERROR; 521 return NULL; 522 } 523 ucnvsel_swap(ds, p, length, swapped, status); 524 udata_closeSwapper(ds); 525 if (U_FAILURE(*status)) { 526 uprv_free(swapped); 527 return NULL; 528 } 529 p = swapped; 530 pHeader = (const DataHeader *)p; 531 } 532 if (length < (pHeader->dataHeader.headerSize + 16 * 4)) { 533 // not even enough space for the header and the indexes 534 uprv_free(swapped); 535 *status = U_INDEX_OUTOFBOUNDS_ERROR; 536 return NULL; 537 } 538 p += pHeader->dataHeader.headerSize; 539 length -= pHeader->dataHeader.headerSize; 540 // indexes 541 const int32_t *indexes = (const int32_t *)p; 542 if (length < indexes[UCNVSEL_INDEX_SIZE]) { 543 uprv_free(swapped); 544 *status = U_INDEX_OUTOFBOUNDS_ERROR; 545 return NULL; 546 } 547 p += UCNVSEL_INDEX_COUNT * 4; 548 // create and populate the selector object 549 UConverterSelector* sel = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector)); 550 char **encodings = 551 (char **)uprv_malloc( 552 indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *)); 553 if (sel == NULL || encodings == NULL) { 554 uprv_free(swapped); 555 uprv_free(sel); 556 uprv_free(encodings); 557 *status = U_MEMORY_ALLOCATION_ERROR; 558 return NULL; 559 } 560 uprv_memset(sel, 0, sizeof(UConverterSelector)); 561 sel->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT]; 562 sel->encodings = encodings; 563 sel->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT]; 564 sel->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH]; 565 sel->swapped = swapped; 566 // trie 567 sel->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 568 p, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL, 569 status); 570 p += indexes[UCNVSEL_INDEX_TRIE_SIZE]; 571 if (U_FAILURE(*status)) { 572 ucnvsel_close(sel); 573 return NULL; 574 } 575 // bit vectors 576 sel->pv = (uint32_t *)p; 577 p += sel->pvCount * 4; 578 // encoding names 579 char* s = (char*)p; 580 for (int32_t i = 0; i < sel->encodingsCount; ++i) { 581 sel->encodings[i] = s; 582 s += uprv_strlen(s) + 1; 583 } 584 p += sel->encodingStrLength; 585 586 return sel; 587} 588 589// a bunch of functions for the enumeration thingie! Nothing fancy here. Just 590// iterate over the selected encodings 591struct Enumerator { 592 int16_t* index; 593 int16_t length; 594 int16_t cur; 595 const UConverterSelector* sel; 596}; 597 598U_CDECL_BEGIN 599 600static void U_CALLCONV 601ucnvsel_close_selector_iterator(UEnumeration *enumerator) { 602 uprv_free(((Enumerator*)(enumerator->context))->index); 603 uprv_free(enumerator->context); 604 uprv_free(enumerator); 605} 606 607 608static int32_t U_CALLCONV 609ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { 610 // check if already failed 611 if (U_FAILURE(*status)) { 612 return 0; 613 } 614 return ((Enumerator*)(enumerator->context))->length; 615} 616 617 618static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, 619 int32_t* resultLength, 620 UErrorCode* status) { 621 // check if already failed 622 if (U_FAILURE(*status)) { 623 return NULL; 624 } 625 626 int16_t cur = ((Enumerator*)(enumerator->context))->cur; 627 const UConverterSelector* sel; 628 const char* result; 629 if (cur >= ((Enumerator*)(enumerator->context))->length) { 630 return NULL; 631 } 632 sel = ((Enumerator*)(enumerator->context))->sel; 633 result = sel->encodings[((Enumerator*)(enumerator->context))->index[cur] ]; 634 ((Enumerator*)(enumerator->context))->cur++; 635 if (resultLength) { 636 *resultLength = uprv_strlen(result); 637 } 638 return result; 639} 640 641static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, 642 UErrorCode* status) { 643 // check if already failed 644 if (U_FAILURE(*status)) { 645 return ; 646 } 647 ((Enumerator*)(enumerator->context))->cur = 0; 648} 649 650U_CDECL_END 651 652 653static const UEnumeration defaultEncodings = { 654 NULL, 655 NULL, 656 ucnvsel_close_selector_iterator, 657 ucnvsel_count_encodings, 658 uenum_unextDefault, 659 ucnvsel_next_encoding, 660 ucnvsel_reset_iterator 661}; 662 663 664// internal fn to intersect two sets of masks 665// returns whether the mask has reduced to all zeros 666UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { 667 int32_t i; 668 uint32_t oredDest = 0; 669 for (i = 0 ; i < len ; ++i) { 670 oredDest |= (dest[i] &= source1[i]); 671 } 672 return oredDest == 0; 673} 674 675// internal fn to count how many 1's are there in a mask 676// algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html 677int16_t countOnes(uint32_t* mask, int32_t len) { 678 int32_t i, totalOnes = 0; 679 for (i = 0 ; i < len ; ++i) { 680 uint32_t ent = mask[i]; 681 for (; ent; totalOnes++) 682 { 683 ent &= ent - 1; // clear the least significant bit set 684 } 685 } 686 return totalOnes; 687} 688 689 690/* internal function! */ 691static UEnumeration *selectForMask(const UConverterSelector* sel, 692 uint32_t *mask, UErrorCode *status) { 693 // this is the context we will use. Store a table of indices to which 694 // encodings are legit. 695 struct Enumerator* result = (Enumerator*)uprv_malloc(sizeof(Enumerator)); 696 if (result == NULL) { 697 uprv_free(mask); 698 *status = U_MEMORY_ALLOCATION_ERROR; 699 return NULL; 700 } 701 result->index = NULL; // this will be allocated later! 702 result->length = result->cur = 0; 703 result->sel = sel; 704 705 UEnumeration *en = (UEnumeration *)uprv_malloc(sizeof(UEnumeration)); 706 if (en == NULL) { 707 // TODO(markus): Combine Enumerator and UEnumeration into one struct. 708 uprv_free(mask); 709 uprv_free(result); 710 *status = U_MEMORY_ALLOCATION_ERROR; 711 return NULL; 712 } 713 memcpy(en, &defaultEncodings, sizeof(UEnumeration)); 714 en->context = result; 715 716 int32_t columns = (sel->encodingsCount+31)/32; 717 int16_t numOnes = countOnes(mask, columns); 718 // now, we know the exact space we need for index 719 if (numOnes > 0) { 720 result->index = (int16_t*) uprv_malloc(numOnes * sizeof(int16_t)); 721 722 int32_t i, j; 723 int16_t k = 0; 724 for (j = 0 ; j < columns; j++) { 725 uint32_t v = mask[j]; 726 for (i = 0 ; i < 32 && k < sel->encodingsCount; i++, k++) { 727 if ((v & 1) != 0) { 728 result->index[result->length++] = k; 729 } 730 v >>= 1; 731 } 732 } 733 } //otherwise, index will remain NULL (and will never be touched by 734 //the enumerator code anyway) 735 uprv_free(mask); 736 return en; 737} 738 739/* check a string against the selector - UTF16 version */ 740U_CAPI UEnumeration * U_EXPORT2 741ucnvsel_selectForString(const UConverterSelector* sel, 742 const UChar *s, int32_t length, UErrorCode *status) { 743 // check if already failed 744 if (U_FAILURE(*status)) { 745 return NULL; 746 } 747 // ensure args make sense! 748 if (sel == NULL || (s == NULL && length != 0)) { 749 *status = U_ILLEGAL_ARGUMENT_ERROR; 750 return NULL; 751 } 752 753 int32_t columns = (sel->encodingsCount+31)/32; 754 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 755 if (mask == NULL) { 756 *status = U_MEMORY_ALLOCATION_ERROR; 757 return NULL; 758 } 759 uprv_memset(mask, ~0, columns *4); 760 761 const UChar *limit; 762 if (length >= 0) { 763 limit = s + length; 764 } else { 765 limit = NULL; 766 } 767 768 while (limit == NULL ? *s != 0 : s != limit) { 769 UChar32 c; 770 uint16_t pvIndex; 771 UTRIE2_U16_NEXT16(sel->trie, s, limit, c, pvIndex); 772 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 773 break; 774 } 775 } 776 return selectForMask(sel, mask, status); 777} 778 779/* check a string against the selector - UTF8 version */ 780U_CAPI UEnumeration * U_EXPORT2 781ucnvsel_selectForUTF8(const UConverterSelector* sel, 782 const char *s, int32_t length, UErrorCode *status) { 783 // check if already failed 784 if (U_FAILURE(*status)) { 785 return NULL; 786 } 787 // ensure args make sense! 788 if (sel == NULL || (s == NULL && length != 0)) { 789 *status = U_ILLEGAL_ARGUMENT_ERROR; 790 return NULL; 791 } 792 793 int32_t columns = (sel->encodingsCount+31)/32; 794 uint32_t* mask = (uint32_t*) uprv_malloc(columns * 4); 795 if (mask == NULL) { 796 *status = U_MEMORY_ALLOCATION_ERROR; 797 return NULL; 798 } 799 uprv_memset(mask, ~0, columns *4); 800 801 if (length < 0) { 802 length = uprv_strlen(s); 803 } 804 const char *limit = s + length; 805 806 while (s != limit) { 807 uint16_t pvIndex; 808 UTRIE2_U8_NEXT16(sel->trie, s, limit, pvIndex); 809 if (intersectMasks(mask, sel->pv+pvIndex, columns)) { 810 break; 811 } 812 } 813 return selectForMask(sel, mask, status); 814} 815