1/* 2****************************************************************************** 3* Copyright (C) 2001-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5****************************************************************************** 6* 7* File ucoleitr.cpp 8* 9* Modification History: 10* 11* Date Name Description 12* 02/15/2001 synwee Modified all methods to process its own function 13* instead of calling the equivalent c++ api (coleitr.h) 14* 2012-2014 markus Rewritten in C++ again. 15******************************************************************************/ 16 17#include "unicode/utypes.h" 18 19#if !UCONFIG_NO_COLLATION 20 21#include "unicode/coleitr.h" 22#include "unicode/tblcoll.h" 23#include "unicode/ucoleitr.h" 24#include "unicode/ustring.h" 25#include "unicode/sortkey.h" 26#include "unicode/uobject.h" 27#include "cmemory.h" 28#include "usrchimp.h" 29 30#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 31 32U_NAMESPACE_USE 33 34#define BUFFER_LENGTH 100 35 36#define DEFAULT_BUFFER_SIZE 16 37#define BUFFER_GROW 8 38 39#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 40 41#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) 42 43#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 44 45#define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0]) 46 47#define DELETE_ARRAY(array) uprv_free((void *) (array)) 48 49struct RCEI 50{ 51 uint32_t ce; 52 int32_t low; 53 int32_t high; 54}; 55 56U_NAMESPACE_BEGIN 57 58struct RCEBuffer 59{ 60 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; 61 RCEI *buffer; 62 int32_t bufferIndex; 63 int32_t bufferSize; 64 65 RCEBuffer(); 66 ~RCEBuffer(); 67 68 UBool empty() const; 69 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh); 70 const RCEI *get(); 71}; 72 73RCEBuffer::RCEBuffer() 74{ 75 buffer = defaultBuffer; 76 bufferIndex = 0; 77 bufferSize = LENGTHOF(defaultBuffer); 78} 79 80RCEBuffer::~RCEBuffer() 81{ 82 if (buffer != defaultBuffer) { 83 DELETE_ARRAY(buffer); 84 } 85} 86 87UBool RCEBuffer::empty() const 88{ 89 return bufferIndex <= 0; 90} 91 92void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh) 93{ 94 if (bufferIndex >= bufferSize) { 95 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); 96 97 ARRAY_COPY(newBuffer, buffer, bufferSize); 98 99 if (buffer != defaultBuffer) { 100 DELETE_ARRAY(buffer); 101 } 102 103 buffer = newBuffer; 104 bufferSize += BUFFER_GROW; 105 } 106 107 buffer[bufferIndex].ce = ce; 108 buffer[bufferIndex].low = ixLow; 109 buffer[bufferIndex].high = ixHigh; 110 111 bufferIndex += 1; 112} 113 114const RCEI *RCEBuffer::get() 115{ 116 if (bufferIndex > 0) { 117 return &buffer[--bufferIndex]; 118 } 119 120 return NULL; 121} 122 123PCEBuffer::PCEBuffer() 124{ 125 buffer = defaultBuffer; 126 bufferIndex = 0; 127 bufferSize = LENGTHOF(defaultBuffer); 128} 129 130PCEBuffer::~PCEBuffer() 131{ 132 if (buffer != defaultBuffer) { 133 DELETE_ARRAY(buffer); 134 } 135} 136 137void PCEBuffer::reset() 138{ 139 bufferIndex = 0; 140} 141 142UBool PCEBuffer::empty() const 143{ 144 return bufferIndex <= 0; 145} 146 147void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh) 148{ 149 if (bufferIndex >= bufferSize) { 150 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); 151 152 ARRAY_COPY(newBuffer, buffer, bufferSize); 153 154 if (buffer != defaultBuffer) { 155 DELETE_ARRAY(buffer); 156 } 157 158 buffer = newBuffer; 159 bufferSize += BUFFER_GROW; 160 } 161 162 buffer[bufferIndex].ce = ce; 163 buffer[bufferIndex].low = ixLow; 164 buffer[bufferIndex].high = ixHigh; 165 166 bufferIndex += 1; 167} 168 169const PCEI *PCEBuffer::get() 170{ 171 if (bufferIndex > 0) { 172 return &buffer[--bufferIndex]; 173 } 174 175 return NULL; 176} 177 178UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); } 179 180UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); } 181 182void UCollationPCE::init(UCollationElements *elems) { 183 init(CollationElementIterator::fromUCollationElements(elems)); 184} 185 186void UCollationPCE::init(CollationElementIterator *iter) 187{ 188 cei = iter; 189 init(*iter->rbc_); 190} 191 192void UCollationPCE::init(const Collator &coll) 193{ 194 UErrorCode status = U_ZERO_ERROR; 195 196 strength = coll.getAttribute(UCOL_STRENGTH, status); 197 toShift = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; 198 isShifted = FALSE; 199 variableTop = coll.getVariableTop(status); 200} 201 202UCollationPCE::~UCollationPCE() 203{ 204 // nothing to do 205} 206 207uint64_t UCollationPCE::processCE(uint32_t ce) 208{ 209 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 210 211 // This is clean, but somewhat slow... 212 // We could apply the mask to ce and then 213 // just get all three orders... 214 switch(strength) { 215 default: 216 tertiary = ucol_tertiaryOrder(ce); 217 /* note fall-through */ 218 219 case UCOL_SECONDARY: 220 secondary = ucol_secondaryOrder(ce); 221 /* note fall-through */ 222 223 case UCOL_PRIMARY: 224 primary = ucol_primaryOrder(ce); 225 } 226 227 // **** This should probably handle continuations too. **** 228 // **** That means that we need 24 bits for the primary **** 229 // **** instead of the 16 that we're currently using. **** 230 // **** So we can lay out the 64 bits as: 24.12.12.16. **** 231 // **** Another complication with continuations is that **** 232 // **** the *second* CE is marked as a continuation, so **** 233 // **** we always have to peek ahead to know how long **** 234 // **** the primary is... **** 235 if ((toShift && variableTop > ce && primary != 0) 236 || (isShifted && primary == 0)) { 237 238 if (primary == 0) { 239 return UCOL_IGNORABLE; 240 } 241 242 if (strength >= UCOL_QUATERNARY) { 243 quaternary = primary; 244 } 245 246 primary = secondary = tertiary = 0; 247 isShifted = TRUE; 248 } else { 249 if (strength >= UCOL_QUATERNARY) { 250 quaternary = 0xFFFF; 251 } 252 253 isShifted = FALSE; 254 } 255 256 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; 257} 258 259U_NAMESPACE_END 260 261/* public methods ---------------------------------------------------- */ 262 263U_CAPI UCollationElements* U_EXPORT2 264ucol_openElements(const UCollator *coll, 265 const UChar *text, 266 int32_t textLength, 267 UErrorCode *status) 268{ 269 if (U_FAILURE(*status)) { 270 return NULL; 271 } 272 if (coll == NULL || (text == NULL && textLength != 0)) { 273 *status = U_ILLEGAL_ARGUMENT_ERROR; 274 return NULL; 275 } 276 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); 277 if (rbc == NULL) { 278 *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator 279 return NULL; 280 } 281 282 UnicodeString s((UBool)(textLength < 0), text, textLength); 283 CollationElementIterator *cei = rbc->createCollationElementIterator(s); 284 if (cei == NULL) { 285 *status = U_MEMORY_ALLOCATION_ERROR; 286 return NULL; 287 } 288 289 return cei->toUCollationElements(); 290} 291 292 293U_CAPI void U_EXPORT2 294ucol_closeElements(UCollationElements *elems) 295{ 296 delete CollationElementIterator::fromUCollationElements(elems); 297} 298 299U_CAPI void U_EXPORT2 300ucol_reset(UCollationElements *elems) 301{ 302 CollationElementIterator::fromUCollationElements(elems)->reset(); 303} 304 305U_CAPI int32_t U_EXPORT2 306ucol_next(UCollationElements *elems, 307 UErrorCode *status) 308{ 309 if (U_FAILURE(*status)) { 310 return UCOL_NULLORDER; 311 } 312 313 return CollationElementIterator::fromUCollationElements(elems)->next(*status); 314} 315 316U_NAMESPACE_BEGIN 317 318int64_t 319UCollationPCE::nextProcessed( 320 int32_t *ixLow, 321 int32_t *ixHigh, 322 UErrorCode *status) 323{ 324 int64_t result = UCOL_IGNORABLE; 325 uint32_t low = 0, high = 0; 326 327 if (U_FAILURE(*status)) { 328 return UCOL_PROCESSED_NULLORDER; 329 } 330 331 pceBuffer.reset(); 332 333 do { 334 low = cei->getOffset(); 335 int32_t ce = cei->next(*status); 336 high = cei->getOffset(); 337 338 if (ce == UCOL_NULLORDER) { 339 result = UCOL_PROCESSED_NULLORDER; 340 break; 341 } 342 343 result = processCE((uint32_t)ce); 344 } while (result == UCOL_IGNORABLE); 345 346 if (ixLow != NULL) { 347 *ixLow = low; 348 } 349 350 if (ixHigh != NULL) { 351 *ixHigh = high; 352 } 353 354 return result; 355} 356 357U_NAMESPACE_END 358 359U_CAPI int32_t U_EXPORT2 360ucol_previous(UCollationElements *elems, 361 UErrorCode *status) 362{ 363 if(U_FAILURE(*status)) { 364 return UCOL_NULLORDER; 365 } 366 return CollationElementIterator::fromUCollationElements(elems)->previous(*status); 367} 368 369U_NAMESPACE_BEGIN 370 371int64_t 372UCollationPCE::previousProcessed( 373 int32_t *ixLow, 374 int32_t *ixHigh, 375 UErrorCode *status) 376{ 377 int64_t result = UCOL_IGNORABLE; 378 int32_t low = 0, high = 0; 379 380 if (U_FAILURE(*status)) { 381 return UCOL_PROCESSED_NULLORDER; 382 } 383 384 // pceBuffer.reset(); 385 386 while (pceBuffer.empty()) { 387 // buffer raw CEs up to non-ignorable primary 388 RCEBuffer rceb; 389 int32_t ce; 390 391 // **** do we need to reset rceb, or will it always be empty at this point **** 392 do { 393 high = cei->getOffset(); 394 ce = cei->previous(*status); 395 low = cei->getOffset(); 396 397 if (ce == UCOL_NULLORDER) { 398 if (! rceb.empty()) { 399 break; 400 } 401 402 goto finish; 403 } 404 405 rceb.put((uint32_t)ce, low, high); 406 } while ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce)); 407 408 // process the raw CEs 409 while (! rceb.empty()) { 410 const RCEI *rcei = rceb.get(); 411 412 result = processCE(rcei->ce); 413 414 if (result != UCOL_IGNORABLE) { 415 pceBuffer.put(result, rcei->low, rcei->high); 416 } 417 } 418 } 419 420finish: 421 if (pceBuffer.empty()) { 422 // **** Is -1 the right value for ixLow, ixHigh? **** 423 if (ixLow != NULL) { 424 *ixLow = -1; 425 } 426 427 if (ixHigh != NULL) { 428 *ixHigh = -1 429 ; 430 } 431 return UCOL_PROCESSED_NULLORDER; 432 } 433 434 const PCEI *pcei = pceBuffer.get(); 435 436 if (ixLow != NULL) { 437 *ixLow = pcei->low; 438 } 439 440 if (ixHigh != NULL) { 441 *ixHigh = pcei->high; 442 } 443 444 return pcei->ce; 445} 446 447U_NAMESPACE_END 448 449U_CAPI int32_t U_EXPORT2 450ucol_getMaxExpansion(const UCollationElements *elems, 451 int32_t order) 452{ 453 return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order); 454 455 // TODO: The old code masked the order according to strength and then did a binary search. 456 // However this was probably at least partially broken because of the following comment. 457 // Still, it might have found a match when this version may not. 458 459 // FIXME: with a masked search, there might be more than one hit, 460 // so we need to look forward and backward from the match to find all 461 // of the hits... 462} 463 464U_CAPI void U_EXPORT2 465ucol_setText( UCollationElements *elems, 466 const UChar *text, 467 int32_t textLength, 468 UErrorCode *status) 469{ 470 if (U_FAILURE(*status)) { 471 return; 472 } 473 474 if ((text == NULL && textLength != 0)) { 475 *status = U_ILLEGAL_ARGUMENT_ERROR; 476 return; 477 } 478 UnicodeString s((UBool)(textLength < 0), text, textLength); 479 return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status); 480} 481 482U_CAPI int32_t U_EXPORT2 483ucol_getOffset(const UCollationElements *elems) 484{ 485 return CollationElementIterator::fromUCollationElements(elems)->getOffset(); 486} 487 488U_CAPI void U_EXPORT2 489ucol_setOffset(UCollationElements *elems, 490 int32_t offset, 491 UErrorCode *status) 492{ 493 if (U_FAILURE(*status)) { 494 return; 495 } 496 497 CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status); 498} 499 500U_CAPI int32_t U_EXPORT2 501ucol_primaryOrder (int32_t order) 502{ 503 return (order >> 16) & 0xffff; 504} 505 506U_CAPI int32_t U_EXPORT2 507ucol_secondaryOrder (int32_t order) 508{ 509 return (order >> 8) & 0xff; 510} 511 512U_CAPI int32_t U_EXPORT2 513ucol_tertiaryOrder (int32_t order) 514{ 515 return order & 0xff; 516} 517 518#endif /* #if !UCONFIG_NO_COLLATION */ 519