1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4****************************************************************************** 5* Copyright (C) 2001-2016, International Business Machines 6* Corporation and others. All Rights Reserved. 7****************************************************************************** 8* 9* File ucoleitr.cpp 10* 11* Modification History: 12* 13* Date Name Description 14* 02/15/2001 synwee Modified all methods to process its own function 15* instead of calling the equivalent c++ api (coleitr.h) 16* 2012-2014 markus Rewritten in C++ again. 17******************************************************************************/ 18 19#include "unicode/utypes.h" 20 21#if !UCONFIG_NO_COLLATION 22 23#include "unicode/coleitr.h" 24#include "unicode/tblcoll.h" 25#include "unicode/ucoleitr.h" 26#include "unicode/ustring.h" 27#include "unicode/sortkey.h" 28#include "unicode/uobject.h" 29#include "cmemory.h" 30#include "usrchimp.h" 31 32U_NAMESPACE_USE 33 34#define BUFFER_LENGTH 100 35 36#define DEFAULT_BUFFER_SIZE 16 37#define BUFFER_GROW 8 38 39#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0]) 40 41#define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type)) 42 43#define DELETE_ARRAY(array) uprv_free((void *) (array)) 44 45struct RCEI 46{ 47 uint32_t ce; 48 int32_t low; 49 int32_t high; 50}; 51 52U_NAMESPACE_BEGIN 53 54struct RCEBuffer 55{ 56 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; 57 RCEI *buffer; 58 int32_t bufferIndex; 59 int32_t bufferSize; 60 61 RCEBuffer(); 62 ~RCEBuffer(); 63 64 UBool isEmpty() const; 65 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); 66 const RCEI *get(); 67}; 68 69RCEBuffer::RCEBuffer() 70{ 71 buffer = defaultBuffer; 72 bufferIndex = 0; 73 bufferSize = UPRV_LENGTHOF(defaultBuffer); 74} 75 76RCEBuffer::~RCEBuffer() 77{ 78 if (buffer != defaultBuffer) { 79 DELETE_ARRAY(buffer); 80 } 81} 82 83UBool RCEBuffer::isEmpty() const 84{ 85 return bufferIndex <= 0; 86} 87 88void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode) 89{ 90 if (U_FAILURE(errorCode)) { 91 return; 92 } 93 if (bufferIndex >= bufferSize) { 94 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); 95 if (newBuffer == NULL) { 96 errorCode = U_MEMORY_ALLOCATION_ERROR; 97 return; 98 } 99 100 ARRAY_COPY(newBuffer, buffer, bufferSize); 101 102 if (buffer != defaultBuffer) { 103 DELETE_ARRAY(buffer); 104 } 105 106 buffer = newBuffer; 107 bufferSize += BUFFER_GROW; 108 } 109 110 buffer[bufferIndex].ce = ce; 111 buffer[bufferIndex].low = ixLow; 112 buffer[bufferIndex].high = ixHigh; 113 114 bufferIndex += 1; 115} 116 117const RCEI *RCEBuffer::get() 118{ 119 if (bufferIndex > 0) { 120 return &buffer[--bufferIndex]; 121 } 122 123 return NULL; 124} 125 126PCEBuffer::PCEBuffer() 127{ 128 buffer = defaultBuffer; 129 bufferIndex = 0; 130 bufferSize = UPRV_LENGTHOF(defaultBuffer); 131} 132 133PCEBuffer::~PCEBuffer() 134{ 135 if (buffer != defaultBuffer) { 136 DELETE_ARRAY(buffer); 137 } 138} 139 140void PCEBuffer::reset() 141{ 142 bufferIndex = 0; 143} 144 145UBool PCEBuffer::isEmpty() const 146{ 147 return bufferIndex <= 0; 148} 149 150void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode) 151{ 152 if (U_FAILURE(errorCode)) { 153 return; 154 } 155 if (bufferIndex >= bufferSize) { 156 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); 157 if (newBuffer == NULL) { 158 errorCode = U_MEMORY_ALLOCATION_ERROR; 159 return; 160 } 161 162 ARRAY_COPY(newBuffer, buffer, bufferSize); 163 164 if (buffer != defaultBuffer) { 165 DELETE_ARRAY(buffer); 166 } 167 168 buffer = newBuffer; 169 bufferSize += BUFFER_GROW; 170 } 171 172 buffer[bufferIndex].ce = ce; 173 buffer[bufferIndex].low = ixLow; 174 buffer[bufferIndex].high = ixHigh; 175 176 bufferIndex += 1; 177} 178 179const PCEI *PCEBuffer::get() 180{ 181 if (bufferIndex > 0) { 182 return &buffer[--bufferIndex]; 183 } 184 185 return NULL; 186} 187 188UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); } 189 190UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); } 191 192void UCollationPCE::init(UCollationElements *elems) { 193 init(CollationElementIterator::fromUCollationElements(elems)); 194} 195 196void UCollationPCE::init(CollationElementIterator *iter) 197{ 198 cei = iter; 199 init(*iter->rbc_); 200} 201 202void UCollationPCE::init(const Collator &coll) 203{ 204 UErrorCode status = U_ZERO_ERROR; 205 206 strength = coll.getAttribute(UCOL_STRENGTH, status); 207 toShift = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; 208 isShifted = FALSE; 209 variableTop = coll.getVariableTop(status); 210} 211 212UCollationPCE::~UCollationPCE() 213{ 214 // nothing to do 215} 216 217uint64_t UCollationPCE::processCE(uint32_t ce) 218{ 219 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 220 221 // This is clean, but somewhat slow... 222 // We could apply the mask to ce and then 223 // just get all three orders... 224 switch(strength) { 225 default: 226 tertiary = ucol_tertiaryOrder(ce); 227 U_FALLTHROUGH; 228 229 case UCOL_SECONDARY: 230 secondary = ucol_secondaryOrder(ce); 231 U_FALLTHROUGH; 232 233 case UCOL_PRIMARY: 234 primary = ucol_primaryOrder(ce); 235 } 236 237 // **** This should probably handle continuations too. **** 238 // **** That means that we need 24 bits for the primary **** 239 // **** instead of the 16 that we're currently using. **** 240 // **** So we can lay out the 64 bits as: 24.12.12.16. **** 241 // **** Another complication with continuations is that **** 242 // **** the *second* CE is marked as a continuation, so **** 243 // **** we always have to peek ahead to know how long **** 244 // **** the primary is... **** 245 if ((toShift && variableTop > ce && primary != 0) 246 || (isShifted && primary == 0)) { 247 248 if (primary == 0) { 249 return UCOL_IGNORABLE; 250 } 251 252 if (strength >= UCOL_QUATERNARY) { 253 quaternary = primary; 254 } 255 256 primary = secondary = tertiary = 0; 257 isShifted = TRUE; 258 } else { 259 if (strength >= UCOL_QUATERNARY) { 260 quaternary = 0xFFFF; 261 } 262 263 isShifted = FALSE; 264 } 265 266 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; 267} 268 269U_NAMESPACE_END 270 271/* public methods ---------------------------------------------------- */ 272 273U_CAPI UCollationElements* U_EXPORT2 274ucol_openElements(const UCollator *coll, 275 const UChar *text, 276 int32_t textLength, 277 UErrorCode *status) 278{ 279 if (U_FAILURE(*status)) { 280 return NULL; 281 } 282 if (coll == NULL || (text == NULL && textLength != 0)) { 283 *status = U_ILLEGAL_ARGUMENT_ERROR; 284 return NULL; 285 } 286 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); 287 if (rbc == NULL) { 288 *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator 289 return NULL; 290 } 291 292 UnicodeString s((UBool)(textLength < 0), text, textLength); 293 CollationElementIterator *cei = rbc->createCollationElementIterator(s); 294 if (cei == NULL) { 295 *status = U_MEMORY_ALLOCATION_ERROR; 296 return NULL; 297 } 298 299 return cei->toUCollationElements(); 300} 301 302 303U_CAPI void U_EXPORT2 304ucol_closeElements(UCollationElements *elems) 305{ 306 delete CollationElementIterator::fromUCollationElements(elems); 307} 308 309U_CAPI void U_EXPORT2 310ucol_reset(UCollationElements *elems) 311{ 312 CollationElementIterator::fromUCollationElements(elems)->reset(); 313} 314 315U_CAPI int32_t U_EXPORT2 316ucol_next(UCollationElements *elems, 317 UErrorCode *status) 318{ 319 if (U_FAILURE(*status)) { 320 return UCOL_NULLORDER; 321 } 322 323 return CollationElementIterator::fromUCollationElements(elems)->next(*status); 324} 325 326U_NAMESPACE_BEGIN 327 328int64_t 329UCollationPCE::nextProcessed( 330 int32_t *ixLow, 331 int32_t *ixHigh, 332 UErrorCode *status) 333{ 334 int64_t result = UCOL_IGNORABLE; 335 uint32_t low = 0, high = 0; 336 337 if (U_FAILURE(*status)) { 338 return UCOL_PROCESSED_NULLORDER; 339 } 340 341 pceBuffer.reset(); 342 343 do { 344 low = cei->getOffset(); 345 int32_t ce = cei->next(*status); 346 high = cei->getOffset(); 347 348 if (ce == UCOL_NULLORDER) { 349 result = UCOL_PROCESSED_NULLORDER; 350 break; 351 } 352 353 result = processCE((uint32_t)ce); 354 } while (result == UCOL_IGNORABLE); 355 356 if (ixLow != NULL) { 357 *ixLow = low; 358 } 359 360 if (ixHigh != NULL) { 361 *ixHigh = high; 362 } 363 364 return result; 365} 366 367U_NAMESPACE_END 368 369U_CAPI int32_t U_EXPORT2 370ucol_previous(UCollationElements *elems, 371 UErrorCode *status) 372{ 373 if(U_FAILURE(*status)) { 374 return UCOL_NULLORDER; 375 } 376 return CollationElementIterator::fromUCollationElements(elems)->previous(*status); 377} 378 379U_NAMESPACE_BEGIN 380 381int64_t 382UCollationPCE::previousProcessed( 383 int32_t *ixLow, 384 int32_t *ixHigh, 385 UErrorCode *status) 386{ 387 int64_t result = UCOL_IGNORABLE; 388 int32_t low = 0, high = 0; 389 390 if (U_FAILURE(*status)) { 391 return UCOL_PROCESSED_NULLORDER; 392 } 393 394 // pceBuffer.reset(); 395 396 while (pceBuffer.isEmpty()) { 397 // buffer raw CEs up to non-ignorable primary 398 RCEBuffer rceb; 399 int32_t ce; 400 401 // **** do we need to reset rceb, or will it always be empty at this point **** 402 do { 403 high = cei->getOffset(); 404 ce = cei->previous(*status); 405 low = cei->getOffset(); 406 407 if (ce == UCOL_NULLORDER) { 408 if (!rceb.isEmpty()) { 409 break; 410 } 411 412 goto finish; 413 } 414 415 rceb.put((uint32_t)ce, low, high, *status); 416 } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce))); 417 418 // process the raw CEs 419 while (U_SUCCESS(*status) && !rceb.isEmpty()) { 420 const RCEI *rcei = rceb.get(); 421 422 result = processCE(rcei->ce); 423 424 if (result != UCOL_IGNORABLE) { 425 pceBuffer.put(result, rcei->low, rcei->high, *status); 426 } 427 } 428 if (U_FAILURE(*status)) { 429 return UCOL_PROCESSED_NULLORDER; 430 } 431 } 432 433finish: 434 if (pceBuffer.isEmpty()) { 435 // **** Is -1 the right value for ixLow, ixHigh? **** 436 if (ixLow != NULL) { 437 *ixLow = -1; 438 } 439 440 if (ixHigh != NULL) { 441 *ixHigh = -1 442 ; 443 } 444 return UCOL_PROCESSED_NULLORDER; 445 } 446 447 const PCEI *pcei = pceBuffer.get(); 448 449 if (ixLow != NULL) { 450 *ixLow = pcei->low; 451 } 452 453 if (ixHigh != NULL) { 454 *ixHigh = pcei->high; 455 } 456 457 return pcei->ce; 458} 459 460U_NAMESPACE_END 461 462U_CAPI int32_t U_EXPORT2 463ucol_getMaxExpansion(const UCollationElements *elems, 464 int32_t order) 465{ 466 return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order); 467 468 // TODO: The old code masked the order according to strength and then did a binary search. 469 // However this was probably at least partially broken because of the following comment. 470 // Still, it might have found a match when this version may not. 471 472 // FIXME: with a masked search, there might be more than one hit, 473 // so we need to look forward and backward from the match to find all 474 // of the hits... 475} 476 477U_CAPI void U_EXPORT2 478ucol_setText( UCollationElements *elems, 479 const UChar *text, 480 int32_t textLength, 481 UErrorCode *status) 482{ 483 if (U_FAILURE(*status)) { 484 return; 485 } 486 487 if ((text == NULL && textLength != 0)) { 488 *status = U_ILLEGAL_ARGUMENT_ERROR; 489 return; 490 } 491 UnicodeString s((UBool)(textLength < 0), text, textLength); 492 return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status); 493} 494 495U_CAPI int32_t U_EXPORT2 496ucol_getOffset(const UCollationElements *elems) 497{ 498 return CollationElementIterator::fromUCollationElements(elems)->getOffset(); 499} 500 501U_CAPI void U_EXPORT2 502ucol_setOffset(UCollationElements *elems, 503 int32_t offset, 504 UErrorCode *status) 505{ 506 if (U_FAILURE(*status)) { 507 return; 508 } 509 510 CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status); 511} 512 513U_CAPI int32_t U_EXPORT2 514ucol_primaryOrder (int32_t order) 515{ 516 return (order >> 16) & 0xffff; 517} 518 519U_CAPI int32_t U_EXPORT2 520ucol_secondaryOrder (int32_t order) 521{ 522 return (order >> 8) & 0xff; 523} 524 525U_CAPI int32_t U_EXPORT2 526ucol_tertiaryOrder (int32_t order) 527{ 528 return order & 0xff; 529} 530 531#endif /* #if !UCONFIG_NO_COLLATION */ 532