1/* 2******************************************************************************* 3* 4* Copyright (C) 2002-2011, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: uset.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2002mar07 14* created by: Markus W. Scherer 15* 16* There are functions to efficiently serialize a USet into an array of uint16_t 17* and functions to use such a serialized form efficiently without 18* instantiating a new USet. 19*/ 20 21#include "unicode/utypes.h" 22#include "unicode/uobject.h" 23#include "unicode/uset.h" 24#include "unicode/uniset.h" 25#include "cmemory.h" 26#include "unicode/ustring.h" 27#include "unicode/parsepos.h" 28 29U_NAMESPACE_USE 30 31U_CAPI USet* U_EXPORT2 32uset_openEmpty() { 33 return (USet*) new UnicodeSet(); 34} 35 36U_CAPI USet* U_EXPORT2 37uset_open(UChar32 start, UChar32 end) { 38 return (USet*) new UnicodeSet(start, end); 39} 40 41U_CAPI void U_EXPORT2 42uset_close(USet* set) { 43 delete (UnicodeSet*) set; 44} 45 46U_CAPI USet * U_EXPORT2 47uset_clone(const USet *set) { 48 return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone()); 49} 50 51U_CAPI UBool U_EXPORT2 52uset_isFrozen(const USet *set) { 53 return ((UnicodeSet*) set)->UnicodeSet::isFrozen(); 54} 55 56U_CAPI void U_EXPORT2 57uset_freeze(USet *set) { 58 ((UnicodeSet*) set)->UnicodeSet::freeze(); 59} 60 61U_CAPI USet * U_EXPORT2 62uset_cloneAsThawed(const USet *set) { 63 return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed()); 64} 65 66U_CAPI void U_EXPORT2 67uset_set(USet* set, 68 UChar32 start, UChar32 end) { 69 ((UnicodeSet*) set)->UnicodeSet::set(start, end); 70} 71 72U_CAPI void U_EXPORT2 73uset_addAll(USet* set, const USet *additionalSet) { 74 ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet)); 75} 76 77U_CAPI void U_EXPORT2 78uset_add(USet* set, UChar32 c) { 79 ((UnicodeSet*) set)->UnicodeSet::add(c); 80} 81 82U_CAPI void U_EXPORT2 83uset_addRange(USet* set, UChar32 start, UChar32 end) { 84 ((UnicodeSet*) set)->UnicodeSet::add(start, end); 85} 86 87U_CAPI void U_EXPORT2 88uset_addString(USet* set, const UChar* str, int32_t strLen) { 89 // UnicodeString handles -1 for strLen 90 UnicodeString s(strLen<0, str, strLen); 91 ((UnicodeSet*) set)->UnicodeSet::add(s); 92} 93 94U_CAPI void U_EXPORT2 95uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) { 96 // UnicodeString handles -1 for strLen 97 UnicodeString s(str, strLen); 98 ((UnicodeSet*) set)->UnicodeSet::addAll(s); 99} 100 101U_CAPI void U_EXPORT2 102uset_remove(USet* set, UChar32 c) { 103 ((UnicodeSet*) set)->UnicodeSet::remove(c); 104} 105 106U_CAPI void U_EXPORT2 107uset_removeRange(USet* set, UChar32 start, UChar32 end) { 108 ((UnicodeSet*) set)->UnicodeSet::remove(start, end); 109} 110 111U_CAPI void U_EXPORT2 112uset_removeString(USet* set, const UChar* str, int32_t strLen) { 113 UnicodeString s(strLen==-1, str, strLen); 114 ((UnicodeSet*) set)->UnicodeSet::remove(s); 115} 116 117U_CAPI void U_EXPORT2 118uset_removeAll(USet* set, const USet* remove) { 119 ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove); 120} 121 122U_CAPI void U_EXPORT2 123uset_retain(USet* set, UChar32 start, UChar32 end) { 124 ((UnicodeSet*) set)->UnicodeSet::retain(start, end); 125} 126 127U_CAPI void U_EXPORT2 128uset_retainAll(USet* set, const USet* retain) { 129 ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain); 130} 131 132U_CAPI void U_EXPORT2 133uset_compact(USet* set) { 134 ((UnicodeSet*) set)->UnicodeSet::compact(); 135} 136 137U_CAPI void U_EXPORT2 138uset_complement(USet* set) { 139 ((UnicodeSet*) set)->UnicodeSet::complement(); 140} 141 142U_CAPI void U_EXPORT2 143uset_complementAll(USet* set, const USet* complement) { 144 ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement); 145} 146 147U_CAPI void U_EXPORT2 148uset_clear(USet* set) { 149 ((UnicodeSet*) set)->UnicodeSet::clear(); 150} 151 152U_CAPI void U_EXPORT2 153uset_removeAllStrings(USet* set) { 154 ((UnicodeSet*) set)->UnicodeSet::removeAllStrings(); 155} 156 157U_CAPI UBool U_EXPORT2 158uset_isEmpty(const USet* set) { 159 return ((const UnicodeSet*) set)->UnicodeSet::isEmpty(); 160} 161 162U_CAPI UBool U_EXPORT2 163uset_contains(const USet* set, UChar32 c) { 164 return ((const UnicodeSet*) set)->UnicodeSet::contains(c); 165} 166 167U_CAPI UBool U_EXPORT2 168uset_containsRange(const USet* set, UChar32 start, UChar32 end) { 169 return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end); 170} 171 172U_CAPI UBool U_EXPORT2 173uset_containsString(const USet* set, const UChar* str, int32_t strLen) { 174 UnicodeString s(strLen==-1, str, strLen); 175 return ((const UnicodeSet*) set)->UnicodeSet::contains(s); 176} 177 178U_CAPI UBool U_EXPORT2 179uset_containsAll(const USet* set1, const USet* set2) { 180 return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2); 181} 182 183U_CAPI UBool U_EXPORT2 184uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) { 185 // Create a string alias, since nothing is being added to the set. 186 UnicodeString s(strLen==-1, str, strLen); 187 return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s); 188} 189 190U_CAPI UBool U_EXPORT2 191uset_containsNone(const USet* set1, const USet* set2) { 192 return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2); 193} 194 195U_CAPI UBool U_EXPORT2 196uset_containsSome(const USet* set1, const USet* set2) { 197 return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2); 198} 199 200U_CAPI int32_t U_EXPORT2 201uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 202 return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition); 203} 204 205U_CAPI int32_t U_EXPORT2 206uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 207 return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition); 208} 209 210U_CAPI int32_t U_EXPORT2 211uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 212 return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition); 213} 214 215U_CAPI int32_t U_EXPORT2 216uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 217 return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition); 218} 219 220U_CAPI UBool U_EXPORT2 221uset_equals(const USet* set1, const USet* set2) { 222 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2; 223} 224 225U_CAPI int32_t U_EXPORT2 226uset_indexOf(const USet* set, UChar32 c) { 227 return ((UnicodeSet*) set)->UnicodeSet::indexOf(c); 228} 229 230U_CAPI UChar32 U_EXPORT2 231uset_charAt(const USet* set, int32_t index) { 232 return ((UnicodeSet*) set)->UnicodeSet::charAt(index); 233} 234 235U_CAPI int32_t U_EXPORT2 236uset_size(const USet* set) { 237 return ((const UnicodeSet*) set)->UnicodeSet::size(); 238} 239 240U_NAMESPACE_BEGIN 241/** 242 * This class only exists to provide access to the UnicodeSet private 243 * USet support API. Declaring a class a friend is more portable than 244 * trying to declare extern "C" functions as friends. 245 */ 246class USetAccess /* not : public UObject because all methods are static */ { 247public: 248 /* Try to have the compiler inline these*/ 249 inline static int32_t getStringCount(const UnicodeSet& set) { 250 return set.getStringCount(); 251 } 252 inline static const UnicodeString* getString(const UnicodeSet& set, 253 int32_t i) { 254 return set.getString(i); 255 } 256private: 257 /* do not instantiate*/ 258 USetAccess(); 259}; 260U_NAMESPACE_END 261 262U_CAPI int32_t U_EXPORT2 263uset_getItemCount(const USet* uset) { 264 const UnicodeSet& set = *(const UnicodeSet*)uset; 265 return set.getRangeCount() + USetAccess::getStringCount(set); 266} 267 268U_CAPI int32_t U_EXPORT2 269uset_getItem(const USet* uset, int32_t itemIndex, 270 UChar32* start, UChar32* end, 271 UChar* str, int32_t strCapacity, 272 UErrorCode* ec) { 273 if (U_FAILURE(*ec)) return 0; 274 const UnicodeSet& set = *(const UnicodeSet*)uset; 275 int32_t rangeCount; 276 277 if (itemIndex < 0) { 278 *ec = U_ILLEGAL_ARGUMENT_ERROR; 279 return -1; 280 } else if (itemIndex < (rangeCount = set.getRangeCount())) { 281 *start = set.getRangeStart(itemIndex); 282 *end = set.getRangeEnd(itemIndex); 283 return 0; 284 } else { 285 itemIndex -= rangeCount; 286 if (itemIndex < USetAccess::getStringCount(set)) { 287 const UnicodeString* s = USetAccess::getString(set, itemIndex); 288 return s->extract(str, strCapacity, *ec); 289 } else { 290 *ec = U_INDEX_OUTOFBOUNDS_ERROR; 291 return -1; 292 } 293 } 294} 295 296//U_CAPI int32_t U_EXPORT2 297//uset_getRangeCount(const USet* set) { 298// return ((const UnicodeSet*) set)->getRangeCount(); 299//} 300// 301//U_CAPI UBool U_EXPORT2 302//uset_getRange(const USet* set, int32_t rangeIndex, 303// UChar32* pStart, UChar32* pEnd) { 304// if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) { 305// return FALSE; 306// } 307// const UnicodeSet* us = (const UnicodeSet*) set; 308// *pStart = us->getRangeStart(rangeIndex); 309// *pEnd = us->getRangeEnd(rangeIndex); 310// return TRUE; 311//} 312 313/* 314 * Serialize a USet into 16-bit units. 315 * Store BMP code points as themselves with one 16-bit unit each. 316 * 317 * Important: the code points in the array are in ascending order, 318 * therefore all BMP code points precede all supplementary code points. 319 * 320 * Store each supplementary code point in 2 16-bit units, 321 * simply with higher-then-lower 16-bit halfs. 322 * 323 * Precede the entire list with the length. 324 * If there are supplementary code points, then set bit 15 in the length 325 * and add the bmpLength between it and the array. 326 * 327 * In other words: 328 * - all BMP: (length=bmpLength) BMP, .., BMP 329 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, .. 330 */ 331U_CAPI int32_t U_EXPORT2 332uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) { 333 if (ec==NULL || U_FAILURE(*ec)) { 334 return 0; 335 } 336 337 return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec); 338} 339 340U_CAPI UBool U_EXPORT2 341uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) { 342 int32_t length; 343 344 if(fillSet==NULL) { 345 return FALSE; 346 } 347 if(src==NULL || srcLength<=0) { 348 fillSet->length=fillSet->bmpLength=0; 349 return FALSE; 350 } 351 352 length=*src++; 353 if(length&0x8000) { 354 /* there are supplementary values */ 355 length&=0x7fff; 356 if(srcLength<(2+length)) { 357 fillSet->length=fillSet->bmpLength=0; 358 return FALSE; 359 } 360 fillSet->bmpLength=*src++; 361 } else { 362 /* only BMP values */ 363 if(srcLength<(1+length)) { 364 fillSet->length=fillSet->bmpLength=0; 365 return FALSE; 366 } 367 fillSet->bmpLength=length; 368 } 369 fillSet->array=src; 370 fillSet->length=length; 371 return TRUE; 372} 373 374U_CAPI void U_EXPORT2 375uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) { 376 if(fillSet==NULL || (uint32_t)c>0x10ffff) { 377 return; 378 } 379 380 fillSet->array=fillSet->staticArray; 381 if(c<0xffff) { 382 fillSet->bmpLength=fillSet->length=2; 383 fillSet->staticArray[0]=(uint16_t)c; 384 fillSet->staticArray[1]=(uint16_t)c+1; 385 } else if(c==0xffff) { 386 fillSet->bmpLength=1; 387 fillSet->length=3; 388 fillSet->staticArray[0]=0xffff; 389 fillSet->staticArray[1]=1; 390 fillSet->staticArray[2]=0; 391 } else if(c<0x10ffff) { 392 fillSet->bmpLength=0; 393 fillSet->length=4; 394 fillSet->staticArray[0]=(uint16_t)(c>>16); 395 fillSet->staticArray[1]=(uint16_t)c; 396 ++c; 397 fillSet->staticArray[2]=(uint16_t)(c>>16); 398 fillSet->staticArray[3]=(uint16_t)c; 399 } else /* c==0x10ffff */ { 400 fillSet->bmpLength=0; 401 fillSet->length=2; 402 fillSet->staticArray[0]=0x10; 403 fillSet->staticArray[1]=0xffff; 404 } 405} 406 407U_CAPI UBool U_EXPORT2 408uset_serializedContains(const USerializedSet* set, UChar32 c) { 409 const uint16_t* array; 410 411 if(set==NULL || (uint32_t)c>0x10ffff) { 412 return FALSE; 413 } 414 415 array=set->array; 416 if(c<=0xffff) { 417 /* find c in the BMP part */ 418 int32_t lo = 0; 419 int32_t hi = set->bmpLength-1; 420 if (c < array[0]) { 421 hi = 0; 422 } else if (c < array[hi]) { 423 for(;;) { 424 int32_t i = (lo + hi) >> 1; 425 if (i == lo) { 426 break; // Done! 427 } else if (c < array[i]) { 428 hi = i; 429 } else { 430 lo = i; 431 } 432 } 433 } else { 434 hi += 1; 435 } 436 return (UBool)(hi&1); 437 } else { 438 /* find c in the supplementary part */ 439 uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c; 440 int32_t base = set->bmpLength; 441 int32_t lo = 0; 442 int32_t hi = set->length - 2 - base; 443 if (high < array[base] || (high==array[base] && low<array[base+1])) { 444 hi = 0; 445 } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) { 446 for (;;) { 447 int32_t i = ((lo + hi) >> 1) & ~1; // Guarantee even result 448 int32_t iabs = i + base; 449 if (i == lo) { 450 break; // Done! 451 } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) { 452 hi = i; 453 } else { 454 lo = i; 455 } 456 } 457 } else { 458 hi += 2; 459 } 460 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */ 461 return (UBool)(((hi+(base<<1))&2)!=0); 462 } 463} 464 465U_CAPI int32_t U_EXPORT2 466uset_getSerializedRangeCount(const USerializedSet* set) { 467 if(set==NULL) { 468 return 0; 469 } 470 471 return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2; 472} 473 474U_CAPI UBool U_EXPORT2 475uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 476 UChar32* pStart, UChar32* pEnd) { 477 const uint16_t* array; 478 int32_t bmpLength, length; 479 480 if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) { 481 return FALSE; 482 } 483 484 array=set->array; 485 length=set->length; 486 bmpLength=set->bmpLength; 487 488 rangeIndex*=2; /* address start/limit pairs */ 489 if(rangeIndex<bmpLength) { 490 *pStart=array[rangeIndex++]; 491 if(rangeIndex<bmpLength) { 492 *pEnd=array[rangeIndex]-1; 493 } else if(rangeIndex<length) { 494 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; 495 } else { 496 *pEnd=0x10ffff; 497 } 498 return TRUE; 499 } else { 500 rangeIndex-=bmpLength; 501 rangeIndex*=2; /* address pairs of pairs of units */ 502 length-=bmpLength; 503 if(rangeIndex<length) { 504 array+=bmpLength; 505 *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1]; 506 rangeIndex+=2; 507 if(rangeIndex<length) { 508 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; 509 } else { 510 *pEnd=0x10ffff; 511 } 512 return TRUE; 513 } else { 514 return FALSE; 515 } 516 } 517} 518 519// TODO The old, internal uset.c had an efficient uset_containsOne function. 520// Returned the one and only code point, or else -1 or something. 521// Consider adding such a function to both C and C++ UnicodeSet/uset. 522// See tools/gennorm/store.c for usage, now usetContainsOne there. 523 524// TODO Investigate incorporating this code into UnicodeSet to improve 525// efficiency. 526// --- 527// #define USET_GROW_DELTA 20 528// 529// static int32_t 530// findChar(const UChar32* array, int32_t length, UChar32 c) { 531// int32_t i; 532// 533// /* check the last range limit first for more efficient appending */ 534// if(length>0) { 535// if(c>=array[length-1]) { 536// return length; 537// } 538// 539// /* do not check the last range limit again in the loop below */ 540// --length; 541// } 542// 543// for(i=0; i<length && c>=array[i]; ++i) {} 544// return i; 545// } 546// 547// static UBool 548// addRemove(USet* set, UChar32 c, int32_t doRemove) { 549// int32_t i, length, more; 550// 551// if(set==NULL || (uint32_t)c>0x10ffff) { 552// return FALSE; 553// } 554// 555// length=set->length; 556// i=findChar(set->array, length, c); 557// if((i&1)^doRemove) { 558// /* c is already in the set */ 559// return TRUE; 560// } 561// 562// /* how many more array items do we need? */ 563// if(i<length && (c+1)==set->array[i]) { 564// /* c is just before the following range, extend that in-place by one */ 565// set->array[i]=c; 566// if(i>0) { 567// --i; 568// if(c==set->array[i]) { 569// /* the previous range collapsed, remove it */ 570// set->length=length-=2; 571// if(i<length) { 572// uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 573// } 574// } 575// } 576// return TRUE; 577// } else if(i>0 && c==set->array[i-1]) { 578// /* c is just after the previous range, extend that in-place by one */ 579// if(++c<=0x10ffff) { 580// set->array[i-1]=c; 581// if(i<length && c==set->array[i]) { 582// /* the following range collapsed, remove it */ 583// --i; 584// set->length=length-=2; 585// if(i<length) { 586// uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 587// } 588// } 589// } else { 590// /* extend the previous range (had limit 0x10ffff) to the end of Unicode */ 591// set->length=i-1; 592// } 593// return TRUE; 594// } else if(i==length && c==0x10ffff) { 595// /* insert one range limit c */ 596// more=1; 597// } else { 598// /* insert two range limits c, c+1 */ 599// more=2; 600// } 601// 602// /* insert <more> range limits */ 603// if(length+more>set->capacity) { 604// /* reallocate */ 605// int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA; 606// UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4); 607// if(newArray==NULL) { 608// return FALSE; 609// } 610// set->capacity=newCapacity; 611// uprv_memcpy(newArray, set->array, length*4); 612// 613// if(set->array!=set->staticBuffer) { 614// uprv_free(set->array); 615// } 616// set->array=newArray; 617// } 618// 619// if(i<length) { 620// uprv_memmove(set->array+i+more, set->array+i, (length-i)*4); 621// } 622// set->array[i]=c; 623// if(more==2) { 624// set->array[i+1]=c+1; 625// } 626// set->length+=more; 627// 628// return TRUE; 629// } 630// 631// U_CAPI UBool U_EXPORT2 632// uset_add(USet* set, UChar32 c) { 633// return addRemove(set, c, 0); 634// } 635// 636// U_CAPI void U_EXPORT2 637// uset_remove(USet* set, UChar32 c) { 638// addRemove(set, c, 1); 639// } 640