1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* 6* Copyright (C) 2002-2011, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9******************************************************************************* 10* file name: uset.cpp 11* encoding: UTF-8 12* tab size: 8 (not used) 13* indentation:4 14* 15* created on: 2002mar07 16* created by: Markus W. Scherer 17* 18* There are functions to efficiently serialize a USet into an array of uint16_t 19* and functions to use such a serialized form efficiently without 20* instantiating a new USet. 21*/ 22 23#include "unicode/utypes.h" 24#include "unicode/uobject.h" 25#include "unicode/uset.h" 26#include "unicode/uniset.h" 27#include "cmemory.h" 28#include "unicode/ustring.h" 29#include "unicode/parsepos.h" 30 31U_NAMESPACE_USE 32 33U_CAPI USet* U_EXPORT2 34uset_openEmpty() { 35 return (USet*) new UnicodeSet(); 36} 37 38U_CAPI USet* U_EXPORT2 39uset_open(UChar32 start, UChar32 end) { 40 return (USet*) new UnicodeSet(start, end); 41} 42 43U_CAPI void U_EXPORT2 44uset_close(USet* set) { 45 delete (UnicodeSet*) set; 46} 47 48U_CAPI USet * U_EXPORT2 49uset_clone(const USet *set) { 50 return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone()); 51} 52 53U_CAPI UBool U_EXPORT2 54uset_isFrozen(const USet *set) { 55 return ((UnicodeSet*) set)->UnicodeSet::isFrozen(); 56} 57 58U_CAPI void U_EXPORT2 59uset_freeze(USet *set) { 60 ((UnicodeSet*) set)->UnicodeSet::freeze(); 61} 62 63U_CAPI USet * U_EXPORT2 64uset_cloneAsThawed(const USet *set) { 65 return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed()); 66} 67 68U_CAPI void U_EXPORT2 69uset_set(USet* set, 70 UChar32 start, UChar32 end) { 71 ((UnicodeSet*) set)->UnicodeSet::set(start, end); 72} 73 74U_CAPI void U_EXPORT2 75uset_addAll(USet* set, const USet *additionalSet) { 76 ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet)); 77} 78 79U_CAPI void U_EXPORT2 80uset_add(USet* set, UChar32 c) { 81 ((UnicodeSet*) set)->UnicodeSet::add(c); 82} 83 84U_CAPI void U_EXPORT2 85uset_addRange(USet* set, UChar32 start, UChar32 end) { 86 ((UnicodeSet*) set)->UnicodeSet::add(start, end); 87} 88 89U_CAPI void U_EXPORT2 90uset_addString(USet* set, const UChar* str, int32_t strLen) { 91 // UnicodeString handles -1 for strLen 92 UnicodeString s(strLen<0, str, strLen); 93 ((UnicodeSet*) set)->UnicodeSet::add(s); 94} 95 96U_CAPI void U_EXPORT2 97uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) { 98 // UnicodeString handles -1 for strLen 99 UnicodeString s(str, strLen); 100 ((UnicodeSet*) set)->UnicodeSet::addAll(s); 101} 102 103U_CAPI void U_EXPORT2 104uset_remove(USet* set, UChar32 c) { 105 ((UnicodeSet*) set)->UnicodeSet::remove(c); 106} 107 108U_CAPI void U_EXPORT2 109uset_removeRange(USet* set, UChar32 start, UChar32 end) { 110 ((UnicodeSet*) set)->UnicodeSet::remove(start, end); 111} 112 113U_CAPI void U_EXPORT2 114uset_removeString(USet* set, const UChar* str, int32_t strLen) { 115 UnicodeString s(strLen==-1, str, strLen); 116 ((UnicodeSet*) set)->UnicodeSet::remove(s); 117} 118 119U_CAPI void U_EXPORT2 120uset_removeAll(USet* set, const USet* remove) { 121 ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove); 122} 123 124U_CAPI void U_EXPORT2 125uset_retain(USet* set, UChar32 start, UChar32 end) { 126 ((UnicodeSet*) set)->UnicodeSet::retain(start, end); 127} 128 129U_CAPI void U_EXPORT2 130uset_retainAll(USet* set, const USet* retain) { 131 ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain); 132} 133 134U_CAPI void U_EXPORT2 135uset_compact(USet* set) { 136 ((UnicodeSet*) set)->UnicodeSet::compact(); 137} 138 139U_CAPI void U_EXPORT2 140uset_complement(USet* set) { 141 ((UnicodeSet*) set)->UnicodeSet::complement(); 142} 143 144U_CAPI void U_EXPORT2 145uset_complementAll(USet* set, const USet* complement) { 146 ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement); 147} 148 149U_CAPI void U_EXPORT2 150uset_clear(USet* set) { 151 ((UnicodeSet*) set)->UnicodeSet::clear(); 152} 153 154U_CAPI void U_EXPORT2 155uset_removeAllStrings(USet* set) { 156 ((UnicodeSet*) set)->UnicodeSet::removeAllStrings(); 157} 158 159U_CAPI UBool U_EXPORT2 160uset_isEmpty(const USet* set) { 161 return ((const UnicodeSet*) set)->UnicodeSet::isEmpty(); 162} 163 164U_CAPI UBool U_EXPORT2 165uset_contains(const USet* set, UChar32 c) { 166 return ((const UnicodeSet*) set)->UnicodeSet::contains(c); 167} 168 169U_CAPI UBool U_EXPORT2 170uset_containsRange(const USet* set, UChar32 start, UChar32 end) { 171 return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end); 172} 173 174U_CAPI UBool U_EXPORT2 175uset_containsString(const USet* set, const UChar* str, int32_t strLen) { 176 UnicodeString s(strLen==-1, str, strLen); 177 return ((const UnicodeSet*) set)->UnicodeSet::contains(s); 178} 179 180U_CAPI UBool U_EXPORT2 181uset_containsAll(const USet* set1, const USet* set2) { 182 return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2); 183} 184 185U_CAPI UBool U_EXPORT2 186uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) { 187 // Create a string alias, since nothing is being added to the set. 188 UnicodeString s(strLen==-1, str, strLen); 189 return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s); 190} 191 192U_CAPI UBool U_EXPORT2 193uset_containsNone(const USet* set1, const USet* set2) { 194 return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2); 195} 196 197U_CAPI UBool U_EXPORT2 198uset_containsSome(const USet* set1, const USet* set2) { 199 return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2); 200} 201 202U_CAPI int32_t U_EXPORT2 203uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 204 return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition); 205} 206 207U_CAPI int32_t U_EXPORT2 208uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 209 return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition); 210} 211 212U_CAPI int32_t U_EXPORT2 213uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 214 return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition); 215} 216 217U_CAPI int32_t U_EXPORT2 218uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 219 return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition); 220} 221 222U_CAPI UBool U_EXPORT2 223uset_equals(const USet* set1, const USet* set2) { 224 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2; 225} 226 227U_CAPI int32_t U_EXPORT2 228uset_indexOf(const USet* set, UChar32 c) { 229 return ((UnicodeSet*) set)->UnicodeSet::indexOf(c); 230} 231 232U_CAPI UChar32 U_EXPORT2 233uset_charAt(const USet* set, int32_t index) { 234 return ((UnicodeSet*) set)->UnicodeSet::charAt(index); 235} 236 237U_CAPI int32_t U_EXPORT2 238uset_size(const USet* set) { 239 return ((const UnicodeSet*) set)->UnicodeSet::size(); 240} 241 242U_NAMESPACE_BEGIN 243/** 244 * This class only exists to provide access to the UnicodeSet private 245 * USet support API. Declaring a class a friend is more portable than 246 * trying to declare extern "C" functions as friends. 247 */ 248class USetAccess /* not : public UObject because all methods are static */ { 249public: 250 /* Try to have the compiler inline these*/ 251 inline static int32_t getStringCount(const UnicodeSet& set) { 252 return set.getStringCount(); 253 } 254 inline static const UnicodeString* getString(const UnicodeSet& set, 255 int32_t i) { 256 return set.getString(i); 257 } 258private: 259 /* do not instantiate*/ 260 USetAccess(); 261}; 262U_NAMESPACE_END 263 264U_CAPI int32_t U_EXPORT2 265uset_getItemCount(const USet* uset) { 266 const UnicodeSet& set = *(const UnicodeSet*)uset; 267 return set.getRangeCount() + USetAccess::getStringCount(set); 268} 269 270U_CAPI int32_t U_EXPORT2 271uset_getItem(const USet* uset, int32_t itemIndex, 272 UChar32* start, UChar32* end, 273 UChar* str, int32_t strCapacity, 274 UErrorCode* ec) { 275 if (U_FAILURE(*ec)) return 0; 276 const UnicodeSet& set = *(const UnicodeSet*)uset; 277 int32_t rangeCount; 278 279 if (itemIndex < 0) { 280 *ec = U_ILLEGAL_ARGUMENT_ERROR; 281 return -1; 282 } else if (itemIndex < (rangeCount = set.getRangeCount())) { 283 *start = set.getRangeStart(itemIndex); 284 *end = set.getRangeEnd(itemIndex); 285 return 0; 286 } else { 287 itemIndex -= rangeCount; 288 if (itemIndex < USetAccess::getStringCount(set)) { 289 const UnicodeString* s = USetAccess::getString(set, itemIndex); 290 return s->extract(str, strCapacity, *ec); 291 } else { 292 *ec = U_INDEX_OUTOFBOUNDS_ERROR; 293 return -1; 294 } 295 } 296} 297 298//U_CAPI int32_t U_EXPORT2 299//uset_getRangeCount(const USet* set) { 300// return ((const UnicodeSet*) set)->getRangeCount(); 301//} 302// 303//U_CAPI UBool U_EXPORT2 304//uset_getRange(const USet* set, int32_t rangeIndex, 305// UChar32* pStart, UChar32* pEnd) { 306// if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) { 307// return FALSE; 308// } 309// const UnicodeSet* us = (const UnicodeSet*) set; 310// *pStart = us->getRangeStart(rangeIndex); 311// *pEnd = us->getRangeEnd(rangeIndex); 312// return TRUE; 313//} 314 315/* 316 * Serialize a USet into 16-bit units. 317 * Store BMP code points as themselves with one 16-bit unit each. 318 * 319 * Important: the code points in the array are in ascending order, 320 * therefore all BMP code points precede all supplementary code points. 321 * 322 * Store each supplementary code point in 2 16-bit units, 323 * simply with higher-then-lower 16-bit halfs. 324 * 325 * Precede the entire list with the length. 326 * If there are supplementary code points, then set bit 15 in the length 327 * and add the bmpLength between it and the array. 328 * 329 * In other words: 330 * - all BMP: (length=bmpLength) BMP, .., BMP 331 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, .. 332 */ 333U_CAPI int32_t U_EXPORT2 334uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) { 335 if (ec==NULL || U_FAILURE(*ec)) { 336 return 0; 337 } 338 339 return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec); 340} 341 342U_CAPI UBool U_EXPORT2 343uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) { 344 int32_t length; 345 346 if(fillSet==NULL) { 347 return FALSE; 348 } 349 if(src==NULL || srcLength<=0) { 350 fillSet->length=fillSet->bmpLength=0; 351 return FALSE; 352 } 353 354 length=*src++; 355 if(length&0x8000) { 356 /* there are supplementary values */ 357 length&=0x7fff; 358 if(srcLength<(2+length)) { 359 fillSet->length=fillSet->bmpLength=0; 360 return FALSE; 361 } 362 fillSet->bmpLength=*src++; 363 } else { 364 /* only BMP values */ 365 if(srcLength<(1+length)) { 366 fillSet->length=fillSet->bmpLength=0; 367 return FALSE; 368 } 369 fillSet->bmpLength=length; 370 } 371 fillSet->array=src; 372 fillSet->length=length; 373 return TRUE; 374} 375 376U_CAPI void U_EXPORT2 377uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) { 378 if(fillSet==NULL || (uint32_t)c>0x10ffff) { 379 return; 380 } 381 382 fillSet->array=fillSet->staticArray; 383 if(c<0xffff) { 384 fillSet->bmpLength=fillSet->length=2; 385 fillSet->staticArray[0]=(uint16_t)c; 386 fillSet->staticArray[1]=(uint16_t)c+1; 387 } else if(c==0xffff) { 388 fillSet->bmpLength=1; 389 fillSet->length=3; 390 fillSet->staticArray[0]=0xffff; 391 fillSet->staticArray[1]=1; 392 fillSet->staticArray[2]=0; 393 } else if(c<0x10ffff) { 394 fillSet->bmpLength=0; 395 fillSet->length=4; 396 fillSet->staticArray[0]=(uint16_t)(c>>16); 397 fillSet->staticArray[1]=(uint16_t)c; 398 ++c; 399 fillSet->staticArray[2]=(uint16_t)(c>>16); 400 fillSet->staticArray[3]=(uint16_t)c; 401 } else /* c==0x10ffff */ { 402 fillSet->bmpLength=0; 403 fillSet->length=2; 404 fillSet->staticArray[0]=0x10; 405 fillSet->staticArray[1]=0xffff; 406 } 407} 408 409U_CAPI UBool U_EXPORT2 410uset_serializedContains(const USerializedSet* set, UChar32 c) { 411 const uint16_t* array; 412 413 if(set==NULL || (uint32_t)c>0x10ffff) { 414 return FALSE; 415 } 416 417 array=set->array; 418 if(c<=0xffff) { 419 /* find c in the BMP part */ 420 int32_t lo = 0; 421 int32_t hi = set->bmpLength-1; 422 if (c < array[0]) { 423 hi = 0; 424 } else if (c < array[hi]) { 425 for(;;) { 426 int32_t i = (lo + hi) >> 1; 427 if (i == lo) { 428 break; // Done! 429 } else if (c < array[i]) { 430 hi = i; 431 } else { 432 lo = i; 433 } 434 } 435 } else { 436 hi += 1; 437 } 438 return (UBool)(hi&1); 439 } else { 440 /* find c in the supplementary part */ 441 uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c; 442 int32_t base = set->bmpLength; 443 int32_t lo = 0; 444 int32_t hi = set->length - 2 - base; 445 if (high < array[base] || (high==array[base] && low<array[base+1])) { 446 hi = 0; 447 } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) { 448 for (;;) { 449 int32_t i = ((lo + hi) >> 1) & ~1; // Guarantee even result 450 int32_t iabs = i + base; 451 if (i == lo) { 452 break; // Done! 453 } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) { 454 hi = i; 455 } else { 456 lo = i; 457 } 458 } 459 } else { 460 hi += 2; 461 } 462 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */ 463 return (UBool)(((hi+(base<<1))&2)!=0); 464 } 465} 466 467U_CAPI int32_t U_EXPORT2 468uset_getSerializedRangeCount(const USerializedSet* set) { 469 if(set==NULL) { 470 return 0; 471 } 472 473 return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2; 474} 475 476U_CAPI UBool U_EXPORT2 477uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 478 UChar32* pStart, UChar32* pEnd) { 479 const uint16_t* array; 480 int32_t bmpLength, length; 481 482 if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) { 483 return FALSE; 484 } 485 486 array=set->array; 487 length=set->length; 488 bmpLength=set->bmpLength; 489 490 rangeIndex*=2; /* address start/limit pairs */ 491 if(rangeIndex<bmpLength) { 492 *pStart=array[rangeIndex++]; 493 if(rangeIndex<bmpLength) { 494 *pEnd=array[rangeIndex]-1; 495 } else if(rangeIndex<length) { 496 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; 497 } else { 498 *pEnd=0x10ffff; 499 } 500 return TRUE; 501 } else { 502 rangeIndex-=bmpLength; 503 rangeIndex*=2; /* address pairs of pairs of units */ 504 length-=bmpLength; 505 if(rangeIndex<length) { 506 array+=bmpLength; 507 *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1]; 508 rangeIndex+=2; 509 if(rangeIndex<length) { 510 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; 511 } else { 512 *pEnd=0x10ffff; 513 } 514 return TRUE; 515 } else { 516 return FALSE; 517 } 518 } 519} 520 521// TODO The old, internal uset.c had an efficient uset_containsOne function. 522// Returned the one and only code point, or else -1 or something. 523// Consider adding such a function to both C and C++ UnicodeSet/uset. 524// See tools/gennorm/store.c for usage, now usetContainsOne there. 525 526// TODO Investigate incorporating this code into UnicodeSet to improve 527// efficiency. 528// --- 529// #define USET_GROW_DELTA 20 530// 531// static int32_t 532// findChar(const UChar32* array, int32_t length, UChar32 c) { 533// int32_t i; 534// 535// /* check the last range limit first for more efficient appending */ 536// if(length>0) { 537// if(c>=array[length-1]) { 538// return length; 539// } 540// 541// /* do not check the last range limit again in the loop below */ 542// --length; 543// } 544// 545// for(i=0; i<length && c>=array[i]; ++i) {} 546// return i; 547// } 548// 549// static UBool 550// addRemove(USet* set, UChar32 c, int32_t doRemove) { 551// int32_t i, length, more; 552// 553// if(set==NULL || (uint32_t)c>0x10ffff) { 554// return FALSE; 555// } 556// 557// length=set->length; 558// i=findChar(set->array, length, c); 559// if((i&1)^doRemove) { 560// /* c is already in the set */ 561// return TRUE; 562// } 563// 564// /* how many more array items do we need? */ 565// if(i<length && (c+1)==set->array[i]) { 566// /* c is just before the following range, extend that in-place by one */ 567// set->array[i]=c; 568// if(i>0) { 569// --i; 570// if(c==set->array[i]) { 571// /* the previous range collapsed, remove it */ 572// set->length=length-=2; 573// if(i<length) { 574// uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 575// } 576// } 577// } 578// return TRUE; 579// } else if(i>0 && c==set->array[i-1]) { 580// /* c is just after the previous range, extend that in-place by one */ 581// if(++c<=0x10ffff) { 582// set->array[i-1]=c; 583// if(i<length && c==set->array[i]) { 584// /* the following range collapsed, remove it */ 585// --i; 586// set->length=length-=2; 587// if(i<length) { 588// uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 589// } 590// } 591// } else { 592// /* extend the previous range (had limit 0x10ffff) to the end of Unicode */ 593// set->length=i-1; 594// } 595// return TRUE; 596// } else if(i==length && c==0x10ffff) { 597// /* insert one range limit c */ 598// more=1; 599// } else { 600// /* insert two range limits c, c+1 */ 601// more=2; 602// } 603// 604// /* insert <more> range limits */ 605// if(length+more>set->capacity) { 606// /* reallocate */ 607// int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA; 608// UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4); 609// if(newArray==NULL) { 610// return FALSE; 611// } 612// set->capacity=newCapacity; 613// uprv_memcpy(newArray, set->array, length*4); 614// 615// if(set->array!=set->staticBuffer) { 616// uprv_free(set->array); 617// } 618// set->array=newArray; 619// } 620// 621// if(i<length) { 622// uprv_memmove(set->array+i+more, set->array+i, (length-i)*4); 623// } 624// set->array[i]=c; 625// if(more==2) { 626// set->array[i+1]=c+1; 627// } 628// set->length+=more; 629// 630// return TRUE; 631// } 632// 633// U_CAPI UBool U_EXPORT2 634// uset_add(USet* set, UChar32 c) { 635// return addRemove(set, c, 0); 636// } 637// 638// U_CAPI void U_EXPORT2 639// uset_remove(USet* set, UChar32 c) { 640// addRemove(set, c, 1); 641// } 642