// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* utf16collationiterator.cpp
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "charstr.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationfcd.h"
#include "collationiterator.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "utf16collationiterator.h"

U_NAMESPACE_BEGIN

// UTF16CollationIterator -------------------------------------------------- ***
//
// Iterates over UTF-16 code units in [start, limit[ with the current position
// in pos. A NULL limit means that the end of the text is not known yet; it is
// recorded when a NUL code unit is encountered (see foundNULTerminator(),
// nextCodePoint() and forwardNumCodePoints()).

// Copies the state of other but rebinds the text pointers to newText:
// start becomes newText, and pos/limit keep the same offsets from the start
// that they had in other. A NULL other.limit stays NULL.
UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
                                               const UChar *newText)
        : CollationIterator(other),
          start(newText),
          pos(newText + (other.pos - other.start)),
          limit(other.limit == NULL ? NULL : newText + (other.limit - other.start)) {
}

UTF16CollationIterator::~UTF16CollationIterator() {}

// Returns FALSE if the base-class comparison fails; otherwise returns whether
// both iterators are at the same offset from their respective text starts.
// NOTE(review): other is downcast with static_cast without a check in this
// function; presumably CollationIterator::operator== already rejects
// iterators of a different dynamic type -- confirm.
UBool
UTF16CollationIterator::operator==(const CollationIterator &other) const {
    if(!CollationIterator::operator==(other)) { return FALSE; }
    const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
    // Compare the iterator state but not the text: Assume that the caller does that.
    return (pos - start) == (o.pos - o.start);
}

// Calls reset() and then moves pos to start + newOffset.
// newOffset is not range-checked here.
void
UTF16CollationIterator::resetToOffset(int32_t newOffset) {
    reset();
    pos = start + newOffset;
}

// Returns the current position as a code unit offset from start.
int32_t
UTF16CollationIterator::getOffset() const {
    return (int32_t)(pos - start);
}

// Reads the next code unit into c, advances pos, and returns the trie value
// looked up for that single code unit.
// At the limit, sets c to U_SENTINEL and returns Collation::FALLBACK_CE32
// without moving pos. The error code parameter is unused.
uint32_t
UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
    if(pos == limit) {
        c = U_SENTINEL;
        return Collation::FALLBACK_CE32;
    }
    c = *pos++;
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
}

// Returns 0 at the limit. Otherwise returns the code unit at pos, and
// consumes it (advances pos) only if it is a trail surrogate.
UChar
UTF16CollationIterator::handleGetTrailSurrogate() {
    if(pos == limit) { return 0; }
    UChar trail;
    if(U16_IS_TRAIL(trail = *pos)) { ++pos; }
    return trail;
}

// If the limit is still unknown (NULL): steps pos back by one code unit,
// records that position as the limit, and returns TRUE.
// If a limit is already set: changes nothing and returns FALSE.
// NOTE(review): presumably called right after a NUL code unit was read, so
// that --pos points at the NUL -- confirm against the caller.
UBool
UTF16CollationIterator::foundNULTerminator() {
    if(limit == NULL) {
        limit = --pos;
        return TRUE;
    } else {
        return FALSE;
    }
}

// Returns the next code point and advances pos past it, or U_SENTINEL at the
// end of the text. The error code parameter is unused.
UChar32
UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
    if(pos == limit) {
        return U_SENTINEL;
    }
    UChar32 c = *pos;
    if(c == 0 && limit == NULL) {
        // NUL with unknown limit: record the limit here; the NUL is not consumed.
        limit = pos;
        return U_SENTINEL;
    }
    ++pos;
    UChar trail;
    // Combine a lead surrogate with a following trail surrogate;
    // an unpaired surrogate is returned as is.
    if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
        ++pos;
        return U16_GET_SUPPLEMENTARY(c, trail);
    } else {
        return c;
    }
}

// Returns the code point before pos and moves pos to its start, or
// U_SENTINEL if pos is already at start. The error code parameter is unused.
UChar32
UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    if(pos == start) {
        return U_SENTINEL;
    }
    UChar32 c = *--pos;
    UChar lead;
    // Combine a trail surrogate with a preceding lead surrogate;
    // an unpaired surrogate is returned as is.
    if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
        --pos;
        return U16_GET_SUPPLEMENTARY(lead, c);
    } else {
        return c;
    }
}

// Advances pos by up to num code points. Stops early at the limit, or at a
// NUL code unit while the limit is unknown (in which case the limit is
// recorded at the NUL). The error code parameter is unused.
void
UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    while(num > 0 && pos != limit) {
        UChar32 c = *pos;
        if(c == 0 && limit == NULL) {
            limit = pos;
            break;
        }
        ++pos;
        --num;
        // A surrogate pair counts as one code point.
        if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
            ++pos;
        }
    }
}

// Moves pos back by up to num code points, stopping early at start.
// The error code parameter is unused.
void
UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    while(num > 0 && pos != start) {
        UChar32 c = *--pos;
        --num;
        // A surrogate pair counts as one code point.
        if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
            --pos;
        }
    }
}

// FCDUTF16CollationIterator ----------------------------------------------- ***
//
// State used by the functions below, as established by the code in this file:
// - rawStart/rawLimit delimit the input text; rawLimit may be NULL until the
//   NUL terminator has been found.
// - segmentStart/segmentLimit are positions in the input text that delimit
//   the current segment.
// - start/pos/limit (inherited) are the range and position currently being
//   iterated. They point either into the input text (start == segmentStart)
//   or, after normalize(), into the normalized buffer.
// - checkDir > 0: checking forward; checkDir < 0: checking backward;
//   checkDir == 0: iterating inside the current segment, which either passed
//   the FCD check or has been normalized into the buffer.

// Copies the state of other but rebinds all input text pointers to newText,
// preserving their offsets from other.rawStart. NULL limits stay NULL.
FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
                                                     const UChar *newText)
        : UTF16CollationIterator(other),
          rawStart(newText),
          segmentStart(newText + (other.segmentStart - other.rawStart)),
          segmentLimit(other.segmentLimit == NULL ? NULL : newText + (other.segmentLimit - other.rawStart)),
          rawLimit(other.rawLimit == NULL ? NULL : newText + (other.rawLimit - other.rawStart)),
          nfcImpl(other.nfcImpl),
          normalized(other.normalized),
          checkDir(other.checkDir) {
    if(checkDir != 0 || other.start == other.segmentStart) {
        // other iterates over its input text: rebase start/pos/limit onto newText.
        start = newText + (other.start - other.rawStart);
        pos = newText + (other.pos - other.rawStart);
        limit = other.limit == NULL ? NULL : newText + (other.limit - other.rawStart);
    } else {
        // other iterates over its normalized buffer: point into this object's
        // own copy of that buffer, at the same offset.
        start = normalized.getBuffer();
        pos = start + (other.pos - other.start);
        limit = start + normalized.length();
    }
}

FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {}

// Compares the base CollationIterator state, the check direction, whether
// both iterators are in the same kind of range (input text vs. normalized
// buffer), and the position: the input text offset when iterating over the
// text, or the segment start offset plus the buffer offset when iterating
// over the normalized buffer.
// NOTE(review): other is downcast with static_cast without a check in this
// function; presumably CollationIterator::operator== already rejects
// iterators of a different dynamic type -- confirm.
UBool
FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
    // Skip the UTF16CollationIterator and call its parent.
    if(!CollationIterator::operator==(other)) { return FALSE; }
    const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
    // Compare the iterator state but not the text: Assume that the caller does that.
    if(checkDir != o.checkDir) { return FALSE; }
    if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; }
    if(checkDir != 0 || start == segmentStart) {
        return (pos - rawStart) == (o.pos - o.rawStart);
    } else {
        return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
                (pos - start) == (o.pos - o.start);
    }
}

// Calls reset(), then restarts forward checking at rawStart + newOffset:
// start, segmentStart and pos are all set to that position, limit is set to
// rawLimit, and checkDir to 1. newOffset is not range-checked here.
void
FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
    reset();
    start = segmentStart = pos = rawStart + newOffset;
    limit = rawLimit;
    checkDir = 1;
}

// Returns the current position as a code unit offset into the input text.
// While iterating over the normalized buffer, only the segment boundaries are
// reported: the segment start offset if pos is at the start of the buffer,
// otherwise the segment limit offset.
int32_t
FCDUTF16CollationIterator::getOffset() const {
    if(checkDir != 0 || start == segmentStart) {
        return (int32_t)(pos - rawStart);
    } else if(pos == start) {
        return (int32_t)(segmentStart - rawStart);
    } else {
        return (int32_t)(segmentLimit - rawStart);
    }
}

// Reads the next code unit into c, advances pos, and returns the trie value
// looked up for that single code unit.
// Sets c to U_SENTINEL and returns Collation::FALLBACK_CE32 at the limit
// while checking forward, or if nextSegment() fails.
// NOTE(review): hasTccc()/hasLccc() presumably test for a possibly nonzero
// trailing/leading combining class -- see collationfcd.h.
uint32_t
FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    for(;;) {
        if(checkDir > 0) {
            // Checking forward in the input text.
            if(pos == limit) {
                c = U_SENTINEL;
                return Collation::FALLBACK_CE32;
            }
            c = *pos++;
            if(CollationFCD::hasTccc(c)) {
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        (pos != limit && CollationFCD::hasLccc(*pos))) {
                    // Un-read c and let nextSegment() check (and if necessary
                    // normalize) the text from here; then read again from
                    // wherever nextSegment() left pos.
                    --pos;
                    if(!nextSegment(errorCode)) {
                        c = U_SENTINEL;
                        return Collation::FALLBACK_CE32;
                    }
                    c = *pos++;
                }
            }
            break;
        } else if(checkDir == 0 && pos != limit) {
            // Inside the current segment: no check needed.
            c = *pos++;
            break;
        } else {
            // Either checking backward, or at the end of the current segment:
            // change state and retry.
            switchToForward();
        }
    }
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
}

// If the limit is still unknown (NULL): steps pos back by one code unit,
// records that position as both limit and rawLimit, and returns TRUE.
// If a limit is already set: changes nothing and returns FALSE.
// NOTE(review): presumably called right after a NUL code unit was read, so
// that --pos points at the NUL -- confirm against the caller.
UBool
FCDUTF16CollationIterator::foundNULTerminator() {
    if(limit == NULL) {
        limit = rawLimit = --pos;
        return TRUE;
    } else {
        return FALSE;
    }
}

// Returns the next code point and advances pos past it.
// Returns U_SENTINEL at the limit while checking forward, at a NUL code unit
// while the limit is unknown (the limit is then recorded and the NUL is not
// consumed), or if nextSegment() fails.
UChar32
FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(checkDir > 0) {
            // Checking forward in the input text.
            if(pos == limit) {
                return U_SENTINEL;
            }
            c = *pos++;
            if(CollationFCD::hasTccc(c)) {
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        (pos != limit && CollationFCD::hasLccc(*pos))) {
                    // Un-read c, check/normalize the segment, then read again
                    // from wherever nextSegment() left pos.
                    --pos;
                    if(!nextSegment(errorCode)) {
                        return U_SENTINEL;
                    }
                    c = *pos++;
                }
            } else if(c == 0 && limit == NULL) {
                // NUL with unknown limit: un-read it and record the limits.
                limit = rawLimit = --pos;
                return U_SENTINEL;
            }
            break;
        } else if(checkDir == 0 && pos != limit) {
            // Inside the current segment: no check needed.
            c = *pos++;
            break;
        } else {
            // Either checking backward, or at the end of the current segment:
            // change state and retry.
            switchToForward();
        }
    }
    UChar trail;
    // Combine a lead surrogate with a following trail surrogate;
    // an unpaired surrogate is returned as is.
    if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
        ++pos;
        return U16_GET_SUPPLEMENTARY(c, trail);
    } else {
        return c;
    }
}

// Returns the code point before pos and moves pos to its start.
// Returns U_SENTINEL at the start while checking backward, or if
// previousSegment() fails.
UChar32
FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(checkDir < 0) {
            // Checking backward in the input text.
            if(pos == start) {
                return U_SENTINEL;
            }
            c = *--pos;
            if(CollationFCD::hasLccc(c)) {
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        (pos != start && CollationFCD::hasTccc(*(pos - 1)))) {
                    // Un-read c, check/normalize the segment, then read again
                    // from wherever previousSegment() left pos.
                    ++pos;
                    if(!previousSegment(errorCode)) {
                        return U_SENTINEL;
                    }
                    c = *--pos;
                }
            }
            break;
        } else if(checkDir == 0 && pos != start) {
            // Inside the current segment: no check needed.
            c = *--pos;
            break;
        } else {
            // Either checking forward, or at the start of the current segment:
            // change state and retry.
            switchToBackward();
        }
    }
    UChar lead;
    // Combine a trail surrogate with a preceding lead surrogate;
    // an unpaired surrogate is returned as is.
    if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
        --pos;
        return U16_GET_SUPPLEMENTARY(lead, c);
    } else {
        return c;
    }
}

// Advances by up to num code points; stops early when nextCodePoint()
// returns a negative value.
void
FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    // Specify the class to avoid a virtual-function indirection.
    // In Java, we would declare this class final.
    while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
        --num;
    }
}

// Moves back by up to num code points; stops early when previousCodePoint()
// returns a negative value.
void
FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    // Specify the class to avoid a virtual-function indirection.
    // In Java, we would declare this class final.
    while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
        --num;
    }
}

// Changes the state so that forward iteration can continue from pos.
// Called when checking backward, or when inside a segment with pos at its
// limit (see the assertion).
void
FCDUTF16CollationIterator::switchToForward() {
    U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
    if(checkDir < 0) {
        // Turn around from backward checking.
        start = segmentStart = pos;
        if(pos == segmentLimit) {
            limit = rawLimit;
            checkDir = 1;  // Check forward.
        } else {  // pos < segmentLimit
            checkDir = 0;  // Stay in FCD segment.
        }
    } else {
        // Reached the end of the FCD segment.
        if(start == segmentStart) {
            // The input text segment is FCD, extend it forward.
        } else {
            // The input text segment needed to be normalized.
            // Switch to checking forward from it.
            pos = start = segmentStart = segmentLimit;
            // Note: If this segment is at the end of the input text,
            // then it might help to return FALSE to indicate that, so that
            // we do not have to re-check and normalize when we turn around and go backwards.
            // However, that would complicate the call sites for an optimization of an unusual case.
        }
        limit = rawLimit;
        checkDir = 1;
    }
}

// Finds the segment that starts at pos, scanning forward up to rawLimit.
// If the scanned text passes the FCD check, limit and segmentLimit are set to
// the boundary that was found and pos is unchanged. Otherwise the text from
// pos to the next boundary is normalized into the buffer (see normalize())
// and pos is set to the start of that buffer.
// On success sets checkDir to 0 and returns TRUE.
// Returns FALSE if errorCode already indicates a failure, or if normalize()
// fails.
// In an fcd16 value, the upper byte is used as the lead combining class and
// the lower byte as the trail combining class.
UBool
FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return FALSE; }
    U_ASSERT(checkDir > 0 && pos != limit);
    // The input text [segmentStart..pos[ passes the FCD check.
    const UChar *p = pos;
    uint8_t prevCC = 0;
    for(;;) {
        // Fetch the next character's fcd16 value.
        const UChar *q = p;
        uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit);
        uint8_t leadCC = (uint8_t)(fcd16 >> 8);
        if(leadCC == 0 && q != pos) {
            // FCD boundary before the [q, p[ character.
            limit = segmentLimit = q;
            break;
        }
        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
            // Fails FCD check. Find the next FCD boundary and normalize.
            // The loop leaves q before the first following character whose
            // lead combining class is 0, or at rawLimit.
            do {
                q = p;
            } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff);
            if(!normalize(pos, q, errorCode)) { return FALSE; }
            pos = start;
            break;
        }
        prevCC = (uint8_t)fcd16;
        if(p == rawLimit || prevCC == 0) {
            // FCD boundary after the last character.
            limit = segmentLimit = p;
            break;
        }
    }
    U_ASSERT(pos != limit);
    checkDir = 0;
    return TRUE;
}

// Changes the state so that backward iteration can continue from pos.
// Called when checking forward, or when inside a segment with pos at its
// start (see the assertion).
void
FCDUTF16CollationIterator::switchToBackward() {
    U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
    if(checkDir > 0) {
        // Turn around from forward checking.
        limit = segmentLimit = pos;
        if(pos == segmentStart) {
            start = rawStart;
            checkDir = -1;  // Check backward.
        } else {  // pos > segmentStart
            checkDir = 0;  // Stay in FCD segment.
        }
    } else {
        // Reached the start of the FCD segment.
        if(start == segmentStart) {
            // The input text segment is FCD, extend it backward.
        } else {
            // The input text segment needed to be normalized.
            // Switch to checking backward from it.
            pos = limit = segmentLimit = segmentStart;
        }
        start = rawStart;
        checkDir = -1;
    }
}

// Finds the segment that ends at pos, scanning backward down to rawStart.
// If the scanned text passes the FCD check, start and segmentStart are set to
// the boundary that was found and pos is unchanged. Otherwise the text from
// the previous boundary to pos is normalized into the buffer (see
// normalize()) and pos is set to the limit of that buffer.
// On success sets checkDir to 0 and returns TRUE.
// Returns FALSE if errorCode already indicates a failure, or if normalize()
// fails.
// In an fcd16 value, the upper byte is used as the lead combining class and
// the lower byte as the trail combining class.
UBool
FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return FALSE; }
    U_ASSERT(checkDir < 0 && pos != start);
    // The input text [pos..segmentLimit[ passes the FCD check.
    const UChar *p = pos;
    uint8_t nextCC = 0;
    for(;;) {
        // Fetch the previous character's fcd16 value.
        const UChar *q = p;
        uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p);
        uint8_t trailCC = (uint8_t)fcd16;
        if(trailCC == 0 && q != pos) {
            // FCD boundary after the [p, q[ character.
            start = segmentStart = q;
            break;
        }
        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
            // Fails FCD check. Find the previous FCD boundary and normalize.
            // Keep moving back while the character just fetched has a nonzero
            // lead combining class (fcd16 > 0xff), the start of the text has
            // not been reached, and the character before it has a nonzero
            // fcd16 value.
            do {
                q = p;
            } while(fcd16 > 0xff && p != rawStart &&
                    (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0);
            if(!normalize(q, pos, errorCode)) { return FALSE; }
            pos = limit;
            break;
        }
        nextCC = (uint8_t)(fcd16 >> 8);
        if(p == rawStart || nextCC == 0) {
            // FCD boundary before the following character.
            start = segmentStart = p;
            break;
        }
    }
    U_ASSERT(pos != start);
    checkDir = 0;
    return TRUE;
}

// Decomposes the input text [from, to[ into the normalized buffer and
// switches start/limit to that buffer; segmentStart/segmentLimit record the
// input text range that was normalized. pos is not set here; callers set it.
// Returns FALSE, leaving the segment and range pointers unchanged, if the
// decomposition reports a failure in errorCode; otherwise returns TRUE.
// errorCode must indicate success on entry (asserted).
UBool
FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) {
    // NFD without argument checking.
    U_ASSERT(U_SUCCESS(errorCode));
    // NOTE(review): the fourth argument is presumably a capacity hint for the
    // destination string -- confirm in normalizer2impl.h.
    nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode);
    if(U_FAILURE(errorCode)) { return FALSE; }
    // Switch collation processing into the FCD buffer
    // with the result of normalizing [segmentStart, segmentLimit[.
    segmentStart = from;
    segmentLimit = to;
    start = normalized.getBuffer();
    limit = start + normalized.length();
    return TRUE;
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION
