1/* 2******************************************************************************* 3* 4* Copyright (C) 2002-2010, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: uset.c 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2002mar07 14* created by: Markus W. Scherer 15* 16* There are functions to efficiently serialize a USet into an array of uint16_t 17* and functions to use such a serialized form efficiently without 18* instantiating a new USet. 19*/ 20 21#include "unicode/utypes.h" 22#include "unicode/uobject.h" 23#include "unicode/uset.h" 24#include "unicode/uniset.h" 25#include "cmemory.h" 26#include "unicode/ustring.h" 27#include "unicode/parsepos.h" 28 29U_NAMESPACE_USE 30 31U_CAPI USet* U_EXPORT2 32uset_openEmpty() { 33 return (USet*) new UnicodeSet(); 34} 35 36U_CAPI USet* U_EXPORT2 37uset_open(UChar32 start, UChar32 end) { 38 return (USet*) new UnicodeSet(start, end); 39} 40 41U_CAPI void U_EXPORT2 42uset_close(USet* set) { 43 delete (UnicodeSet*) set; 44} 45 46U_CAPI USet * U_EXPORT2 47uset_clone(const USet *set) { 48 return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone()); 49} 50 51U_CAPI UBool U_EXPORT2 52uset_isFrozen(const USet *set) { 53 return ((UnicodeSet*) set)->UnicodeSet::isFrozen(); 54} 55 56U_CAPI void U_EXPORT2 57uset_freeze(USet *set) { 58 ((UnicodeSet*) set)->UnicodeSet::freeze(); 59} 60 61U_CAPI USet * U_EXPORT2 62uset_cloneAsThawed(const USet *set) { 63 return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed()); 64} 65 66U_CAPI void U_EXPORT2 67uset_set(USet* set, 68 UChar32 start, UChar32 end) { 69 ((UnicodeSet*) set)->UnicodeSet::set(start, end); 70} 71 72U_CAPI void U_EXPORT2 73uset_addAll(USet* set, const USet *additionalSet) { 74 ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet)); 75} 76 77U_CAPI void U_EXPORT2 78uset_add(USet* set, UChar32 c) { 79 ((UnicodeSet*) set)->UnicodeSet::add(c); 80} 81 82U_CAPI void U_EXPORT2 83uset_addRange(USet* set, UChar32 start, UChar32 end) { 84 ((UnicodeSet*) set)->UnicodeSet::add(start, end); 85} 86 87U_CAPI void U_EXPORT2 88uset_addString(USet* set, const UChar* str, int32_t strLen) { 89 // UnicodeString handles -1 for strLen 90 UnicodeString s(strLen<0, str, strLen); 91 ((UnicodeSet*) set)->UnicodeSet::add(s); 92} 93 94U_CAPI void U_EXPORT2 95uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) { 96 // UnicodeString handles -1 for strLen 97 UnicodeString s(str, strLen); 98 ((UnicodeSet*) set)->UnicodeSet::addAll(s); 99} 100 101U_CAPI void U_EXPORT2 102uset_remove(USet* set, UChar32 c) { 103 ((UnicodeSet*) set)->UnicodeSet::remove(c); 104} 105 106U_CAPI void U_EXPORT2 107uset_removeRange(USet* set, UChar32 start, UChar32 end) { 108 ((UnicodeSet*) set)->UnicodeSet::remove(start, end); 109} 110 111U_CAPI void U_EXPORT2 112uset_removeString(USet* set, const UChar* str, int32_t strLen) { 113 UnicodeString s(strLen==-1, str, strLen); 114 ((UnicodeSet*) set)->UnicodeSet::remove(s); 115} 116 117U_CAPI void U_EXPORT2 118uset_removeAll(USet* set, const USet* remove) { 119 ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove); 120} 121 122U_CAPI void U_EXPORT2 123uset_retain(USet* set, UChar32 start, UChar32 end) { 124 ((UnicodeSet*) set)->UnicodeSet::retain(start, end); 125} 126 127U_CAPI void U_EXPORT2 128uset_retainAll(USet* set, const USet* retain) { 129 ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain); 130} 131 132U_CAPI void U_EXPORT2 133uset_compact(USet* set) { 134 ((UnicodeSet*) set)->UnicodeSet::compact(); 135} 136 137U_CAPI void U_EXPORT2 138uset_complement(USet* set) { 139 ((UnicodeSet*) set)->UnicodeSet::complement(); 140} 141 142U_CAPI void U_EXPORT2 143uset_complementAll(USet* set, const USet* complement) { 144 ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement); 145} 146 147U_CAPI void U_EXPORT2 148uset_clear(USet* set) { 149 ((UnicodeSet*) set)->UnicodeSet::clear(); 150} 151 152U_CAPI void U_EXPORT2 153uset_closeOver(USet* set, int32_t attributes) { 154 ((UnicodeSet*) set)->UnicodeSet::closeOver(attributes); 155} 156 157U_CAPI void U_EXPORT2 158uset_removeAllStrings(USet* set) { 159 ((UnicodeSet*) set)->UnicodeSet::removeAllStrings(); 160} 161 162U_CAPI UBool U_EXPORT2 163uset_isEmpty(const USet* set) { 164 return ((const UnicodeSet*) set)->UnicodeSet::isEmpty(); 165} 166 167U_CAPI UBool U_EXPORT2 168uset_contains(const USet* set, UChar32 c) { 169 return ((const UnicodeSet*) set)->UnicodeSet::contains(c); 170} 171 172U_CAPI UBool U_EXPORT2 173uset_containsRange(const USet* set, UChar32 start, UChar32 end) { 174 return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end); 175} 176 177U_CAPI UBool U_EXPORT2 178uset_containsString(const USet* set, const UChar* str, int32_t strLen) { 179 UnicodeString s(strLen==-1, str, strLen); 180 return ((const UnicodeSet*) set)->UnicodeSet::contains(s); 181} 182 183U_CAPI UBool U_EXPORT2 184uset_containsAll(const USet* set1, const USet* set2) { 185 return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2); 186} 187 188U_CAPI UBool U_EXPORT2 189uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) { 190 // Create a string alias, since nothing is being added to the set. 191 UnicodeString s(strLen==-1, str, strLen); 192 return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s); 193} 194 195U_CAPI UBool U_EXPORT2 196uset_containsNone(const USet* set1, const USet* set2) { 197 return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2); 198} 199 200U_CAPI UBool U_EXPORT2 201uset_containsSome(const USet* set1, const USet* set2) { 202 return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2); 203} 204 205U_CAPI int32_t U_EXPORT2 206uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 207 return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition); 208} 209 210U_CAPI int32_t U_EXPORT2 211uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 212 return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition); 213} 214 215U_CAPI int32_t U_EXPORT2 216uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 217 return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition); 218} 219 220U_CAPI int32_t U_EXPORT2 221uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 222 return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition); 223} 224 225U_CAPI UBool U_EXPORT2 226uset_equals(const USet* set1, const USet* set2) { 227 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2; 228} 229 230U_CAPI int32_t U_EXPORT2 231uset_indexOf(const USet* set, UChar32 c) { 232 return ((UnicodeSet*) set)->UnicodeSet::indexOf(c); 233} 234 235U_CAPI UChar32 U_EXPORT2 236uset_charAt(const USet* set, int32_t index) { 237 return ((UnicodeSet*) set)->UnicodeSet::charAt(index); 238} 239 240U_CAPI int32_t U_EXPORT2 241uset_size(const USet* set) { 242 return ((const UnicodeSet*) set)->UnicodeSet::size(); 243} 244 245U_NAMESPACE_BEGIN 246/** 247 * This class only exists to provide access to the UnicodeSet private 248 * USet support API. Declaring a class a friend is more portable than 249 * trying to declare extern "C" functions as friends. 250 */ 251class USetAccess /* not : public UObject because all methods are static */ { 252public: 253 /* Try to have the compiler inline these*/ 254 inline static int32_t getStringCount(const UnicodeSet& set) { 255 return set.getStringCount(); 256 } 257 inline static const UnicodeString* getString(const UnicodeSet& set, 258 int32_t i) { 259 return set.getString(i); 260 } 261private: 262 /* do not instantiate*/ 263 USetAccess(); 264}; 265U_NAMESPACE_END 266 267U_CAPI int32_t U_EXPORT2 268uset_getItemCount(const USet* uset) { 269 const UnicodeSet& set = *(const UnicodeSet*)uset; 270 return set.getRangeCount() + USetAccess::getStringCount(set); 271} 272 273U_CAPI int32_t U_EXPORT2 274uset_getItem(const USet* uset, int32_t itemIndex, 275 UChar32* start, UChar32* end, 276 UChar* str, int32_t strCapacity, 277 UErrorCode* ec) { 278 if (U_FAILURE(*ec)) return 0; 279 const UnicodeSet& set = *(const UnicodeSet*)uset; 280 int32_t rangeCount; 281 282 if (itemIndex < 0) { 283 *ec = U_ILLEGAL_ARGUMENT_ERROR; 284 return -1; 285 } else if (itemIndex < (rangeCount = set.getRangeCount())) { 286 *start = set.getRangeStart(itemIndex); 287 *end = set.getRangeEnd(itemIndex); 288 return 0; 289 } else { 290 itemIndex -= rangeCount; 291 if (itemIndex < USetAccess::getStringCount(set)) { 292 const UnicodeString* s = USetAccess::getString(set, itemIndex); 293 return s->extract(str, strCapacity, *ec); 294 } else { 295 *ec = U_INDEX_OUTOFBOUNDS_ERROR; 296 return -1; 297 } 298 } 299} 300 301//U_CAPI int32_t U_EXPORT2 302//uset_getRangeCount(const USet* set) { 303// return ((const UnicodeSet*) set)->getRangeCount(); 304//} 305// 306//U_CAPI UBool U_EXPORT2 307//uset_getRange(const USet* set, int32_t rangeIndex, 308// UChar32* pStart, UChar32* pEnd) { 309// if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) { 310// return FALSE; 311// } 312// const UnicodeSet* us = (const UnicodeSet*) set; 313// *pStart = us->getRangeStart(rangeIndex); 314// *pEnd = us->getRangeEnd(rangeIndex); 315// return TRUE; 316//} 317 318/* 319 * Serialize a USet into 16-bit units. 320 * Store BMP code points as themselves with one 16-bit unit each. 321 * 322 * Important: the code points in the array are in ascending order, 323 * therefore all BMP code points precede all supplementary code points. 324 * 325 * Store each supplementary code point in 2 16-bit units, 326 * simply with higher-then-lower 16-bit halfs. 327 * 328 * Precede the entire list with the length. 329 * If there are supplementary code points, then set bit 15 in the length 330 * and add the bmpLength between it and the array. 331 * 332 * In other words: 333 * - all BMP: (length=bmpLength) BMP, .., BMP 334 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, .. 335 */ 336U_CAPI int32_t U_EXPORT2 337uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) { 338 if (ec==NULL || U_FAILURE(*ec)) { 339 return 0; 340 } 341 342 return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec); 343} 344 345U_CAPI UBool U_EXPORT2 346uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) { 347 int32_t length; 348 349 if(fillSet==NULL) { 350 return FALSE; 351 } 352 if(src==NULL || srcLength<=0) { 353 fillSet->length=fillSet->bmpLength=0; 354 return FALSE; 355 } 356 357 length=*src++; 358 if(length&0x8000) { 359 /* there are supplementary values */ 360 length&=0x7fff; 361 if(srcLength<(2+length)) { 362 fillSet->length=fillSet->bmpLength=0; 363 return FALSE; 364 } 365 fillSet->bmpLength=*src++; 366 } else { 367 /* only BMP values */ 368 if(srcLength<(1+length)) { 369 fillSet->length=fillSet->bmpLength=0; 370 return FALSE; 371 } 372 fillSet->bmpLength=length; 373 } 374 fillSet->array=src; 375 fillSet->length=length; 376 return TRUE; 377} 378 379U_CAPI void U_EXPORT2 380uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) { 381 if(fillSet==NULL || (uint32_t)c>0x10ffff) { 382 return; 383 } 384 385 fillSet->array=fillSet->staticArray; 386 if(c<0xffff) { 387 fillSet->bmpLength=fillSet->length=2; 388 fillSet->staticArray[0]=(uint16_t)c; 389 fillSet->staticArray[1]=(uint16_t)c+1; 390 } else if(c==0xffff) { 391 fillSet->bmpLength=1; 392 fillSet->length=3; 393 fillSet->staticArray[0]=0xffff; 394 fillSet->staticArray[1]=1; 395 fillSet->staticArray[2]=0; 396 } else if(c<0x10ffff) { 397 fillSet->bmpLength=0; 398 fillSet->length=4; 399 fillSet->staticArray[0]=(uint16_t)(c>>16); 400 fillSet->staticArray[1]=(uint16_t)c; 401 ++c; 402 fillSet->staticArray[2]=(uint16_t)(c>>16); 403 fillSet->staticArray[3]=(uint16_t)c; 404 } else /* c==0x10ffff */ { 405 fillSet->bmpLength=0; 406 fillSet->length=2; 407 fillSet->staticArray[0]=0x10; 408 fillSet->staticArray[1]=0xffff; 409 } 410} 411 412U_CAPI UBool U_EXPORT2 413uset_serializedContains(const USerializedSet* set, UChar32 c) { 414 const uint16_t* array; 415 416 if(set==NULL || (uint32_t)c>0x10ffff) { 417 return FALSE; 418 } 419 420 array=set->array; 421 if(c<=0xffff) { 422 /* find c in the BMP part */ 423 int32_t lo = 0; 424 int32_t hi = set->bmpLength-1; 425 if (c < array[0]) { 426 hi = 0; 427 } else if (c < array[hi]) { 428 for(;;) { 429 int32_t i = (lo + hi) >> 1; 430 if (i == lo) { 431 break; // Done! 432 } else if (c < array[i]) { 433 hi = i; 434 } else { 435 lo = i; 436 } 437 } 438 } else { 439 hi += 1; 440 } 441 return (UBool)(hi&1); 442 } else { 443 /* find c in the supplementary part */ 444 uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c; 445 int32_t base = set->bmpLength; 446 int32_t lo = 0; 447 int32_t hi = set->length - 2 - base; 448 if (high < array[base] || (high==array[base] && low<array[base+1])) { 449 hi = 0; 450 } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) { 451 for (;;) { 452 int32_t i = ((lo + hi) >> 1) & ~1; // Guarantee even result 453 int32_t iabs = i + base; 454 if (i == lo) { 455 break; // Done! 456 } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) { 457 hi = i; 458 } else { 459 lo = i; 460 } 461 } 462 } else { 463 hi += 2; 464 } 465 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */ 466 return (UBool)(((hi+(base<<1))&2)!=0); 467 } 468} 469 470U_CAPI int32_t U_EXPORT2 471uset_getSerializedRangeCount(const USerializedSet* set) { 472 if(set==NULL) { 473 return 0; 474 } 475 476 return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2; 477} 478 479U_CAPI UBool U_EXPORT2 480uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 481 UChar32* pStart, UChar32* pEnd) { 482 const uint16_t* array; 483 int32_t bmpLength, length; 484 485 if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) { 486 return FALSE; 487 } 488 489 array=set->array; 490 length=set->length; 491 bmpLength=set->bmpLength; 492 493 rangeIndex*=2; /* address start/limit pairs */ 494 if(rangeIndex<bmpLength) { 495 *pStart=array[rangeIndex++]; 496 if(rangeIndex<bmpLength) { 497 *pEnd=array[rangeIndex]-1; 498 } else if(rangeIndex<length) { 499 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; 500 } else { 501 *pEnd=0x10ffff; 502 } 503 return TRUE; 504 } else { 505 rangeIndex-=bmpLength; 506 rangeIndex*=2; /* address pairs of pairs of units */ 507 length-=bmpLength; 508 if(rangeIndex<length) { 509 array+=bmpLength; 510 *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1]; 511 rangeIndex+=2; 512 if(rangeIndex<length) { 513 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; 514 } else { 515 *pEnd=0x10ffff; 516 } 517 return TRUE; 518 } else { 519 return FALSE; 520 } 521 } 522} 523 524// TODO The old, internal uset.c had an efficient uset_containsOne function. 525// Returned the one and only code point, or else -1 or something. 526// Consider adding such a function to both C and C++ UnicodeSet/uset. 527// See tools/gennorm/store.c for usage, now usetContainsOne there. 528 529// TODO Investigate incorporating this code into UnicodeSet to improve 530// efficiency. 531// --- 532// #define USET_GROW_DELTA 20 533// 534// static U_INLINE int32_t 535// findChar(const UChar32* array, int32_t length, UChar32 c) { 536// int32_t i; 537// 538// /* check the last range limit first for more efficient appending */ 539// if(length>0) { 540// if(c>=array[length-1]) { 541// return length; 542// } 543// 544// /* do not check the last range limit again in the loop below */ 545// --length; 546// } 547// 548// for(i=0; i<length && c>=array[i]; ++i) {} 549// return i; 550// } 551// 552// static UBool 553// addRemove(USet* set, UChar32 c, int32_t doRemove) { 554// int32_t i, length, more; 555// 556// if(set==NULL || (uint32_t)c>0x10ffff) { 557// return FALSE; 558// } 559// 560// length=set->length; 561// i=findChar(set->array, length, c); 562// if((i&1)^doRemove) { 563// /* c is already in the set */ 564// return TRUE; 565// } 566// 567// /* how many more array items do we need? */ 568// if(i<length && (c+1)==set->array[i]) { 569// /* c is just before the following range, extend that in-place by one */ 570// set->array[i]=c; 571// if(i>0) { 572// --i; 573// if(c==set->array[i]) { 574// /* the previous range collapsed, remove it */ 575// set->length=length-=2; 576// if(i<length) { 577// uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 578// } 579// } 580// } 581// return TRUE; 582// } else if(i>0 && c==set->array[i-1]) { 583// /* c is just after the previous range, extend that in-place by one */ 584// if(++c<=0x10ffff) { 585// set->array[i-1]=c; 586// if(i<length && c==set->array[i]) { 587// /* the following range collapsed, remove it */ 588// --i; 589// set->length=length-=2; 590// if(i<length) { 591// uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 592// } 593// } 594// } else { 595// /* extend the previous range (had limit 0x10ffff) to the end of Unicode */ 596// set->length=i-1; 597// } 598// return TRUE; 599// } else if(i==length && c==0x10ffff) { 600// /* insert one range limit c */ 601// more=1; 602// } else { 603// /* insert two range limits c, c+1 */ 604// more=2; 605// } 606// 607// /* insert <more> range limits */ 608// if(length+more>set->capacity) { 609// /* reallocate */ 610// int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA; 611// UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4); 612// if(newArray==NULL) { 613// return FALSE; 614// } 615// set->capacity=newCapacity; 616// uprv_memcpy(newArray, set->array, length*4); 617// 618// if(set->array!=set->staticBuffer) { 619// uprv_free(set->array); 620// } 621// set->array=newArray; 622// } 623// 624// if(i<length) { 625// uprv_memmove(set->array+i+more, set->array+i, (length-i)*4); 626// } 627// set->array[i]=c; 628// if(more==2) { 629// set->array[i+1]=c+1; 630// } 631// set->length+=more; 632// 633// return TRUE; 634// } 635// 636// U_CAPI UBool U_EXPORT2 637// uset_add(USet* set, UChar32 c) { 638// return addRemove(set, c, 0); 639// } 640// 641// U_CAPI void U_EXPORT2 642// uset_remove(USet* set, UChar32 c) { 643// addRemove(set, c, 1); 644// } 645