1/* 2******************************************************************************* 3* 4* Copyright (C) 2009-2013, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: normalizer2impl.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2009nov22 14* created by: Markus W. Scherer 15*/ 16 17#include "unicode/utypes.h" 18 19#if !UCONFIG_NO_NORMALIZATION 20 21#include "unicode/normalizer2.h" 22#include "unicode/udata.h" 23#include "unicode/ustring.h" 24#include "unicode/utf16.h" 25#include "cmemory.h" 26#include "mutex.h" 27#include "normalizer2impl.h" 28#include "putilimp.h" 29#include "uassert.h" 30#include "uset_imp.h" 31#include "utrie2.h" 32#include "uvector.h" 33 34U_NAMESPACE_BEGIN 35 36// ReorderingBuffer -------------------------------------------------------- *** 37 38UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { 39 int32_t length=str.length(); 40 start=str.getBuffer(destCapacity); 41 if(start==NULL) { 42 // getBuffer() already did str.setToBogus() 43 errorCode=U_MEMORY_ALLOCATION_ERROR; 44 return FALSE; 45 } 46 limit=start+length; 47 remainingCapacity=str.getCapacity()-length; 48 reorderStart=start; 49 if(start==limit) { 50 lastCC=0; 51 } else { 52 setIterator(); 53 lastCC=previousCC(); 54 // Set reorderStart after the last code point with cc<=1 if there is one. 
55 if(lastCC>1) { 56 while(previousCC()>1) {} 57 } 58 reorderStart=codePointLimit; 59 } 60 return TRUE; 61} 62 63UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { 64 int32_t length=(int32_t)(limit-start); 65 return 66 length==(int32_t)(otherLimit-otherStart) && 67 0==u_memcmp(start, otherStart, length); 68} 69 70UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 71 if(remainingCapacity<2 && !resize(2, errorCode)) { 72 return FALSE; 73 } 74 if(lastCC<=cc || cc==0) { 75 limit[0]=U16_LEAD(c); 76 limit[1]=U16_TRAIL(c); 77 limit+=2; 78 lastCC=cc; 79 if(cc<=1) { 80 reorderStart=limit; 81 } 82 } else { 83 insert(c, cc); 84 } 85 remainingCapacity-=2; 86 return TRUE; 87} 88 89UBool ReorderingBuffer::append(const UChar *s, int32_t length, 90 uint8_t leadCC, uint8_t trailCC, 91 UErrorCode &errorCode) { 92 if(length==0) { 93 return TRUE; 94 } 95 if(remainingCapacity<length && !resize(length, errorCode)) { 96 return FALSE; 97 } 98 remainingCapacity-=length; 99 if(lastCC<=leadCC || leadCC==0) { 100 if(trailCC<=1) { 101 reorderStart=limit+length; 102 } else if(leadCC<=1) { 103 reorderStart=limit+1; // Ok if not a code point boundary. 104 } 105 const UChar *sLimit=s+length; 106 do { *limit++=*s++; } while(s!=sLimit); 107 lastCC=trailCC; 108 } else { 109 int32_t i=0; 110 UChar32 c; 111 U16_NEXT(s, i, length, c); 112 insert(c, leadCC); // insert first code point 113 while(i<length) { 114 U16_NEXT(s, i, length, c); 115 if(i<length) { 116 // s must be in NFD, otherwise we need to use getCC(). 
117 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 118 } else { 119 leadCC=trailCC; 120 } 121 append(c, leadCC, errorCode); 122 } 123 } 124 return TRUE; 125} 126 127UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { 128 int32_t cpLength=U16_LENGTH(c); 129 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { 130 return FALSE; 131 } 132 remainingCapacity-=cpLength; 133 if(cpLength==1) { 134 *limit++=(UChar)c; 135 } else { 136 limit[0]=U16_LEAD(c); 137 limit[1]=U16_TRAIL(c); 138 limit+=2; 139 } 140 lastCC=0; 141 reorderStart=limit; 142 return TRUE; 143} 144 145UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { 146 if(s==sLimit) { 147 return TRUE; 148 } 149 int32_t length=(int32_t)(sLimit-s); 150 if(remainingCapacity<length && !resize(length, errorCode)) { 151 return FALSE; 152 } 153 u_memcpy(limit, s, length); 154 limit+=length; 155 remainingCapacity-=length; 156 lastCC=0; 157 reorderStart=limit; 158 return TRUE; 159} 160 161void ReorderingBuffer::remove() { 162 reorderStart=limit=start; 163 remainingCapacity=str.getCapacity(); 164 lastCC=0; 165} 166 167void ReorderingBuffer::removeSuffix(int32_t suffixLength) { 168 if(suffixLength<(limit-start)) { 169 limit-=suffixLength; 170 remainingCapacity+=suffixLength; 171 } else { 172 limit=start; 173 remainingCapacity=str.getCapacity(); 174 } 175 lastCC=0; 176 reorderStart=limit; 177} 178 179UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { 180 int32_t reorderStartIndex=(int32_t)(reorderStart-start); 181 int32_t length=(int32_t)(limit-start); 182 str.releaseBuffer(length); 183 int32_t newCapacity=length+appendLength; 184 int32_t doubleCapacity=2*str.getCapacity(); 185 if(newCapacity<doubleCapacity) { 186 newCapacity=doubleCapacity; 187 } 188 if(newCapacity<256) { 189 newCapacity=256; 190 } 191 start=str.getBuffer(newCapacity); 192 if(start==NULL) { 193 // getBuffer() already did str.setToBogus() 194 
errorCode=U_MEMORY_ALLOCATION_ERROR; 195 return FALSE; 196 } 197 reorderStart=start+reorderStartIndex; 198 limit=start+length; 199 remainingCapacity=str.getCapacity()-length; 200 return TRUE; 201} 202 203void ReorderingBuffer::skipPrevious() { 204 codePointLimit=codePointStart; 205 UChar c=*--codePointStart; 206 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { 207 --codePointStart; 208 } 209} 210 211uint8_t ReorderingBuffer::previousCC() { 212 codePointLimit=codePointStart; 213 if(reorderStart>=codePointStart) { 214 return 0; 215 } 216 UChar32 c=*--codePointStart; 217 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) { 218 return 0; 219 } 220 221 UChar c2; 222 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { 223 --codePointStart; 224 c=U16_GET_SUPPLEMENTARY(c2, c); 225 } 226 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 227} 228 229// Inserts c somewhere before the last character. 230// Requires 0<cc<lastCC which implies reorderStart<limit. 
void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
    // Walk backward from the end of the text: skip the last code point
    // unconditionally, then keep stepping back while the preceding code point
    // has a combining class greater than cc.
    // (setIterator() is defined elsewhere; presumably it positions the
    // iterator at limit -- confirm.)
    for(setIterator(), skipPrevious(); previousCC()>cc;) {}
    // insert c at codePointLimit, after the character with prevCC<=cc
    UChar *q=limit;
    UChar *r=limit+=U16_LENGTH(c);
    // Shift the tail [codePointLimit, old limit[ toward the end by the length of c.
    do {
        *--r=*--q;
    } while(codePointLimit!=q);
    writeCodePoint(q, c);
    if(cc<=1) {
        // r now points just behind the inserted code point.
        reorderStart=r;
    }
}

// Normalizer2Impl --------------------------------------------------------- ***

// Data used for canonical iteration.
// Normalizer2Impl refers to an instance through fCanonIterData
// (deleted in the destructor, read in addCanonIterPropertyStarts()).
// The member functions are only declared here; their definitions are
// outside this part of the file.
struct CanonIterData : public UMemory {
    CanonIterData(UErrorCode &errorCode);
    ~CanonIterData();
    void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
    UTrie2 *trie;
    UVector canonStartSets;  // contains UnicodeSet *
};

// Releases the loaded data memory, the normalization trie,
// and the canonical iterator data.
Normalizer2Impl::~Normalizer2Impl() {
    udata_close(memory);
    utrie2_close(normTrie);
    delete fCanonIterData;
}

// Callback handed to udata_openChoice() by load().
// Accepts a data item only if its UDataInfo has size>=20, matches this
// platform's endianness and charset family, has dataFormat "Nrm2" and
// formatVersion[0]==2.
// On acceptance, copies the 4-byte dataVersion into the Normalizer2Impl
// that was passed as context and returns TRUE; otherwise returns FALSE.
UBool U_CALLCONV
Normalizer2Impl::isAcceptable(void *context,
                              const char * /* type */, const char * /*name*/,
                              const UDataInfo *pInfo) {
    if(
        pInfo->size>=20 &&
        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily==U_CHARSET_FAMILY &&
        pInfo->dataFormat[0]==0x4e &&    /* dataFormat="Nrm2" */
        pInfo->dataFormat[1]==0x72 &&
        pInfo->dataFormat[2]==0x6d &&
        pInfo->dataFormat[3]==0x32 &&
        pInfo->formatVersion[0]==2
    ) {
        Normalizer2Impl *me=(Normalizer2Impl *)context;
        uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
        return TRUE;
    } else {
        return FALSE;
    }
}

// Opens the "nrm" data item `name` in `packageName` and sets up this object
// from it: the threshold values from the indexes array, the normalization
// trie, the pointers into the extra data and the smallFCD bytes,
// and the tccc180[] table.
// Does nothing if errorCode already indicates a failure.
// Sets errorCode to U_INVALID_FORMAT_ERROR if the indexes array is too short;
// failures from udata_openChoice() and utrie2_openFromSerialized() are
// reported through errorCode as well.
void
Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }
    const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
    const int32_t *inIndexes=(const int32_t *)inBytes;
    // The trie follows the indexes, so its byte offset divided by 4
    // is the number of int32_t indexes.
    int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
    if(indexesLength<=IX_MIN_MAYBE_YES) {
        errorCode=U_INVALID_FORMAT_ERROR;  // Not enough indexes.
        return;
    }

    minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
    minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];

    minYesNo=inIndexes[IX_MIN_YES_NO];
    minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
    minNoNo=inIndexes[IX_MIN_NO_NO];
    limitNoNo=inIndexes[IX_LIMIT_NO_NO];
    minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];

    // Each section of the data spans from its own offset to the next section's offset.
    int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
    int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
    normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
                                       inBytes+offset, nextOffset-offset, NULL,
                                       &errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }

    offset=nextOffset;
    nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
    maybeYesCompositions=(const uint16_t *)(inBytes+offset);
    extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);

    // smallFCD: new in formatVersion 2
    offset=nextOffset;
    smallFCD=inBytes+offset;

    // Build tccc180[].
    // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
    // Each bit of a smallFCD byte stands for a run of 0x20 code points;
    // c advances by 0x20 in either branch, bits is shifted once per run.
    uint8_t bits=0;
    for(UChar c=0; c<0x180; bits>>=1) {
        if((c&0xff)==0) {
            bits=smallFCD[c>>8];  // one byte per 0x100 code points
        }
        if(bits&1) {
            // Bit set: look up each code point; the cast keeps the low byte of the FCD value.
            for(int i=0; i<0x20; ++i, ++c) {
                tccc180[c]=(uint8_t)getFCD16FromNormData(c);
            }
        } else {
            // Bit clear: the whole run gets 0.
            uprv_memset(tccc180+c, 0, 0x20);
            c+=0x20;
        }
    }
}

// Returns the trailing combining class for the single code point stored in
// [cpStart, cpLimit[.
// The range is read as one code unit if it has length 1,
// otherwise as a surrogate pair made of its first two units.
uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
    UChar32 c;
    if(cpStart==(cpLimit-1)) {
        c=*cpStart;
    } else {
        c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
    }
    uint16_t prevNorm16=getNorm16(c);
    if(prevNorm16<=minYesNo) {
        return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
    } else {
        return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
    }
}

U_CDECL_BEGIN

// Range callback for utrie2_enum(): adds the start of each enumerated range
// to the set behind the USetAdder passed as context. Always returns TRUE.
static UBool U_CALLCONV
enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    /* add the start code point to the USet */
    const USetAdder *sa=(const USetAdder *)context;
    sa->add(sa->set, start);
    return TRUE;
}

// Value callback for utrie2_enum(): reduces a trie value to its
// CANON_NOT_SEGMENT_STARTER bit.
static uint32_t U_CALLCONV
segmentStarterMapper(const void * /*context*/, uint32_t value) {
    return value&CANON_NOT_SEGMENT_STARTER;
}

U_CDECL_END

// Adds to sa the start of every same-value range of the normalization trie,
// plus the Hangul-related boundaries added below.
// The error code parameter is unused.
void
Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
    /* add the start code point of each same-value range of each trie */
    utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);

    /* add Hangul LV syllables and LV+1 because of skippables */
    for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
        sa->add(sa->set, c);
        sa->add(sa->set, c+1);
    }
    sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
}

// Adds to sa the range starts of the canonical iterator data trie,
// with values reduced by segmentStarterMapper().
// Does nothing if ensureCanonIterData() returns FALSE.
void
Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
    /* add the start code point of each same-value range of the canonical iterator data trie */
    if(ensureCanonIterData(errorCode)) {
        // currently only used for the SEGMENT_STARTER property
        utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
    }
}

// Scans a NUL-terminated string from src for the leading run of code units
// that are non-zero and below minNeedDataCP, and appends that run to buffer
// if buffer is not NULL.
// Returns a pointer to the first code unit behind that run
// (the terminating NUL or the first unit >= minNeedDataCP).
// The result of appendZeroCC() is not checked here; decompose() tests
// errorCode after calling this function.
const UChar *
Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
                                                UChar32 minNeedDataCP,
                                                ReorderingBuffer *buffer,
                                                UErrorCode &errorCode) const {
    // Make some effort to support NUL-terminated strings reasonably.
    // Take the part of the fast quick check loop that does not look up
    // data and check the first part of the string.
    // After this prefix, determine the string length to simplify the rest
    // of the code.
    const UChar *prevSrc=src;
    UChar c;
    while((c=*src++)<minNeedDataCP && c!=0) {}
    // Back out the last character for full processing.
    // Copy this prefix.
    if(--src!=prevSrc) {
        if(buffer!=NULL) {
            buffer->appendZeroCC(prevSrc, src, errorCode);
        }
    }
    return src;
}

// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/spanQuickCheckYes
//
// limit==NULL means that src is NUL-terminated.
// With a buffer: appends the decomposition of [src, limit[ and returns the
// position reached (limit unless an append failed).
// Without a buffer: returns limit if the whole text passes the check,
// otherwise the last boundary (prevBoundary) before the first code point
// that decomposes or whose combining class is out of order.
const UChar *
Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
                           ReorderingBuffer *buffer,
                           UErrorCode &errorCode) const {
    UChar32 minNoCP=minDecompNoCP;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;

    // only for quick check
    const UChar *prevBoundary=src;
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src; src!=limit;) {
            if( (c=*src)<minNoCP ||
                isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // c is a surrogate code unit: try to assemble the full
                // supplementary code point and look up its data.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL) {
                if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
                    break;
                }
            } else {
                prevCC=0;
                prevBoundary=src;
            }
        }
        if(src==limit) {
            break;
        }

        // Check one above-minimum, relevant code point.
        src+=U16_LENGTH(c);
        if(buffer!=NULL) {
            if(!decompose(c, norm16, *buffer, errorCode)) {
                break;
            }
        } else {
            if(isDecompYes(norm16)) {
                uint8_t cc=getCCFromYesOrMaybe(norm16);
                if(prevCC<=cc || cc==0) {
                    prevCC=cc;
                    if(cc<=1) {
                        prevBoundary=src;
                    }
                    continue;
                }
            }
            return prevBoundary;  // "no" or cc out of order
        }
    }
    return src;
}

// Decompose a short piece of text which is likely to contain characters that
// fail the quick check loop and/or where the quick check loop's overhead
// is unlikely to be amortized.
// Called by the compose() and makeFCD() implementations.
// Returns FALSE as soon as decomposing one code point fails, TRUE otherwise.
UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
                                      ReorderingBuffer &buffer,
                                      UErrorCode &errorCode) const {
    while(src<limit) {
        UChar32 c;
        uint16_t norm16;
        UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
        if(!decompose(c, norm16, buffer, errorCode)) {
            return FALSE;
        }
    }
    return TRUE;
}

// Appends the decomposition of the single code point c, whose trie value is
// norm16, to buffer.
// Returns the result of the buffer append call that was used.
UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
                                 ReorderingBuffer &buffer,
                                 UErrorCode &errorCode) const {
    // Only loops for 1:1 algorithmic mappings.
    for(;;) {
        // get the decomposition and the lead and trail cc's
        if(isDecompYes(norm16)) {
            // c does not decompose
            return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
        } else if(isHangul(norm16)) {
            // Hangul syllable: decompose algorithmically
            // The return value of Hangul::decompose() is used as the number of units written.
            UChar jamos[3];
            return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
        } else if(isDecompNoAlgorithmic(norm16)) {
            // Replace c with its mapped code point and look at that one next.
            c=mapAlgorithmic(c, norm16);
            norm16=getNorm16(c);
        } else {
            // c decomposes, get everything from the variable-length extra data
            // firstUnit: length in the MAPPING_LENGTH_MASK bits, trailCC in the high byte.
            const uint16_t *mapping=getMapping(norm16);
            uint16_t firstUnit=*mapping;
            int32_t length=firstUnit&MAPPING_LENGTH_MASK;
            uint8_t leadCC, trailCC;
            trailCC=(uint8_t)(firstUnit>>8);
            if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
                // The unit before firstUnit carries leadCC in its high byte.
                leadCC=(uint8_t)(*(mapping-1)>>8);
            } else {
                leadCC=0;
            }
            return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
        }
    }
}

// Returns the decomposition of c and sets length to its number of code units,
// or returns NULL (leaving length untouched) if c does not decompose.
// The result points either into the caller's buffer (Hangul and
// algorithmic mappings) or directly into the mapping data.
const UChar *
Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
    const UChar *decomp=NULL;
    uint16_t norm16;
    for(;;) {
        if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
            // c does not decompose
            // decomp is still NULL on the first pass; after an algorithmic
            // mapping it is the buffer holding the mapped code point.
            return decomp;
        } else if(isHangul(norm16)) {
            // Hangul syllable: decompose algorithmically
            length=Hangul::decompose(c, buffer);
            return buffer;
        } else if(isDecompNoAlgorithmic(norm16)) {
            // Store the mapped code point as the provisional result,
            // then loop to see whether it decomposes further.
            c=mapAlgorithmic(c, norm16);
            decomp=buffer;
            length=0;
            U16_APPEND_UNSAFE(buffer, length, c);
        } else {
            // c decomposes, get everything from the variable-length extra data
            const uint16_t *mapping=getMapping(norm16);
            length=*mapping&MAPPING_LENGTH_MASK;
            return (const UChar *)mapping+1;
        }
    }
}

// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
// so that a raw mapping fits that consists of one unit ("rm0")
// plus all but the first two code units of the
normal mapping. 592// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK. 593const UChar * 594Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const { 595 // We do not loop in this method because an algorithmic mapping itself 596 // becomes a final result rather than having to be decomposed recursively. 597 uint16_t norm16; 598 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 599 // c does not decompose 600 return NULL; 601 } else if(isHangul(norm16)) { 602 // Hangul syllable: decompose algorithmically 603 Hangul::getRawDecomposition(c, buffer); 604 length=2; 605 return buffer; 606 } else if(isDecompNoAlgorithmic(norm16)) { 607 c=mapAlgorithmic(c, norm16); 608 length=0; 609 U16_APPEND_UNSAFE(buffer, length, c); 610 return buffer; 611 } else { 612 // c decomposes, get everything from the variable-length extra data 613 const uint16_t *mapping=getMapping(norm16); 614 uint16_t firstUnit=*mapping; 615 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 616 if(firstUnit&MAPPING_HAS_RAW_MAPPING) { 617 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 618 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 619 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1; 620 uint16_t rm0=*rawMapping; 621 if(rm0<=MAPPING_LENGTH_MASK) { 622 length=rm0; 623 return (const UChar *)rawMapping-rm0; 624 } else { 625 // Copy the normal mapping and replace its first two code units with rm0. 
626 buffer[0]=(UChar)rm0; 627 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2); 628 length=mLength-1; 629 return buffer; 630 } 631 } else { 632 length=mLength; 633 return (const UChar *)mapping+1; 634 } 635 } 636} 637 638void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, 639 UBool doDecompose, 640 UnicodeString &safeMiddle, 641 ReorderingBuffer &buffer, 642 UErrorCode &errorCode) const { 643 buffer.copyReorderableSuffixTo(safeMiddle); 644 if(doDecompose) { 645 decompose(src, limit, &buffer, errorCode); 646 return; 647 } 648 // Just merge the strings at the boundary. 649 ForwardUTrie2StringIterator iter(normTrie, src, limit); 650 uint8_t firstCC, prevCC, cc; 651 firstCC=prevCC=cc=getCC(iter.next16()); 652 while(cc!=0) { 653 prevCC=cc; 654 cc=getCC(iter.next16()); 655 }; 656 if(limit==NULL) { // appendZeroCC() needs limit!=NULL 657 limit=u_strchr(iter.codePointStart, 0); 658 } 659 660 if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) { 661 buffer.appendZeroCC(iter.codePointStart, limit, errorCode); 662 } 663} 664 665// Note: hasDecompBoundary() could be implemented as aliases to 666// hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 667// at the cost of building the FCD trie for a decomposition normalizer. 
668UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { 669 for(;;) { 670 if(c<minDecompNoCP) { 671 return TRUE; 672 } 673 uint16_t norm16=getNorm16(c); 674 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 675 return TRUE; 676 } else if(norm16>MIN_NORMAL_MAYBE_YES) { 677 return FALSE; // ccc!=0 678 } else if(isDecompNoAlgorithmic(norm16)) { 679 c=mapAlgorithmic(c, norm16); 680 } else { 681 // c decomposes, get everything from the variable-length extra data 682 const uint16_t *mapping=getMapping(norm16); 683 uint16_t firstUnit=*mapping; 684 if((firstUnit&MAPPING_LENGTH_MASK)==0) { 685 return FALSE; 686 } 687 if(!before) { 688 // decomp after-boundary: same as hasFCDBoundaryAfter(), 689 // fcd16<=1 || trailCC==0 690 if(firstUnit>0x1ff) { 691 return FALSE; // trailCC>1 692 } 693 if(firstUnit<=0xff) { 694 return TRUE; // trailCC==0 695 } 696 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 697 } 698 // TRUE if leadCC==0 (hasFCDBoundaryBefore()) 699 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; 700 } 701 } 702} 703 704/* 705 * Finds the recomposition result for 706 * a forward-combining "lead" character, 707 * specified with a pointer to its compositions list, 708 * and a backward-combining "trail" character. 709 * 710 * If the lead and trail characters combine, then this function returns 711 * the following "compositeAndFwd" value: 712 * Bits 21..1 composite character 713 * Bit 0 set if the composite is a forward-combining starter 714 * otherwise it returns -1. 715 * 716 * The compositions list has (trail, compositeAndFwd) pair entries, 717 * encoded as either pairs or triples of 16-bit units. 718 * The last entry has the high bit of its first unit set. 719 * 720 * The list is sorted by ascending trail characters (there are no duplicates). 721 * A linear search is used. 722 * 723 * See normalizer2impl.h for a more detailed description 724 * of the compositions list format. 
725 */ 726int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { 727 uint16_t key1, firstUnit; 728 if(trail<COMP_1_TRAIL_LIMIT) { 729 // trail character is 0..33FF 730 // result entry may have 2 or 3 units 731 key1=(uint16_t)(trail<<1); 732 while(key1>(firstUnit=*list)) { 733 list+=2+(firstUnit&COMP_1_TRIPLE); 734 } 735 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 736 if(firstUnit&COMP_1_TRIPLE) { 737 return ((int32_t)list[1]<<16)|list[2]; 738 } else { 739 return list[1]; 740 } 741 } 742 } else { 743 // trail character is 3400..10FFFF 744 // result entry has 3 units 745 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ 746 (((trail>>COMP_1_TRAIL_SHIFT))& 747 ~COMP_1_TRIPLE)); 748 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT); 749 uint16_t secondUnit; 750 for(;;) { 751 if(key1>(firstUnit=*list)) { 752 list+=2+(firstUnit&COMP_1_TRIPLE); 753 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 754 if(key2>(secondUnit=list[1])) { 755 if(firstUnit&COMP_1_LAST_TUPLE) { 756 break; 757 } else { 758 list+=3; 759 } 760 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 761 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; 762 } else { 763 break; 764 } 765 } else { 766 break; 767 } 768 } 769 } 770 return -1; 771} 772 773/** 774 * @param list some character's compositions list 775 * @param set recursively receives the composites from these compositions 776 */ 777void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { 778 uint16_t firstUnit; 779 int32_t compositeAndFwd; 780 do { 781 firstUnit=*list; 782 if((firstUnit&COMP_1_TRIPLE)==0) { 783 compositeAndFwd=list[1]; 784 list+=2; 785 } else { 786 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; 787 list+=3; 788 } 789 UChar32 composite=compositeAndFwd>>1; 790 if((compositeAndFwd&1)!=0) { 791 addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 792 } 793 set.add(composite); 794 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 795} 796 797/* 798 * 
Recomposes the buffer text starting at recomposeStartIndex
 * (which is in NFD - decomposed and canonically ordered),
 * and truncates the buffer contents.
 *
 * Note that recomposition never lengthens the text:
 * Any character consists of either one or two code units;
 * a composition may contain at most one more code unit than the original starter,
 * while the combining mark that is removed has at least one code unit.
 *
 * The text is modified in place through the pointers obtained from the buffer;
 * the new end of the text is handed back with buffer.setReorderingLimit().
 * With onlyContiguous set, a character that does not combine and has a
 * non-zero combining class ends the search for compositions with the
 * current starter.
 */
void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
                                UBool onlyContiguous) const {
    UChar *p=buffer.getStart()+recomposeStartIndex;
    UChar *limit=buffer.getLimit();
    if(p==limit) {
        return;
    }

    UChar *starter, *pRemove, *q, *r;
    const uint16_t *compositionsList;
    UChar32 c, compositeAndFwd;
    uint16_t norm16;
    uint8_t cc, prevCC;
    UBool starterIsSupplementary;

    // Some of the following variables are not used until we have a forward-combining starter
    // and are only initialized now to avoid compiler warnings.
    compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
    starter=NULL;
    starterIsSupplementary=FALSE;
    prevCC=0;

    for(;;) {
        // Read the next code point c and its data; p moves behind c.
        UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
        cc=getCCFromYesOrMaybe(norm16);
        if( // this character combines backward and
            isMaybe(norm16) &&
            // we have seen a starter that combines forward and
            compositionsList!=NULL &&
            // the backward-combining character is not blocked
            (prevCC<cc || prevCC==0)
        ) {
            if(isJamoVT(norm16)) {
                // c is a Jamo V/T, see if we can compose it with the previous character.
                if(c<Hangul::JAMO_T_BASE) {
                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                    // prev is the starter's offset from JAMO_L_BASE; the unsigned
                    // comparison below also rejects starters below JAMO_L_BASE.
                    UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
                    if(prev<Hangul::JAMO_L_COUNT) {
                        pRemove=p-1;
                        UChar syllable=(UChar)
                            (Hangul::HANGUL_BASE+
                             (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                             Hangul::JAMO_T_COUNT);
                        UChar t;
                        if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
                            ++p;
                            syllable+=t;  // The next character was a Jamo T.
                        }
                        *starter=syllable;
                        // remove the Jamo V/T
                        q=pRemove;
                        r=p;
                        while(r<limit) {
                            *q++=*r++;
                        }
                        limit=q;
                        p=pRemove;
                    }
                }
                /*
                 * No "else" for Jamo T:
                 * Since the input is in NFD, there are no Hangul LV syllables that
                 * a Jamo T could combine with.
                 * All Jamo Ts are combined above when handling Jamo Vs.
                 */
                if(p==limit) {
                    break;
                }
                compositionsList=NULL;
                continue;
            } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
                // The starter and the combining mark (c) do combine.
                UChar32 composite=compositeAndFwd>>1;

                // Replace the starter with the composite, remove the combining mark.
                pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
                if(starterIsSupplementary) {
                    if(U_IS_SUPPLEMENTARY(composite)) {
                        // both are supplementary
                        starter[0]=U16_LEAD(composite);
                        starter[1]=U16_TRAIL(composite);
                    } else {
                        *starter=(UChar)composite;
                        // The composite is shorter than the starter,
                        // move the intermediate characters forward one.
                        starterIsSupplementary=FALSE;
                        q=starter+1;
                        r=q+1;
                        while(r<pRemove) {
                            *q++=*r++;
                        }
                        --pRemove;
                    }
                } else if(U_IS_SUPPLEMENTARY(composite)) {
                    // The composite is longer than the starter,
                    // move the intermediate characters back one.
                    starterIsSupplementary=TRUE;
                    ++starter;  // temporarily increment for the loop boundary
                    q=pRemove;
                    r=++pRemove;
                    while(starter<q) {
                        *--r=*--q;
                    }
                    *starter=U16_TRAIL(composite);
                    *--starter=U16_LEAD(composite);  // undo the temporary increment
                } else {
                    // both are on the BMP
                    *starter=(UChar)composite;
                }

                /* remove the combining mark by moving the following text over it */
                // pRemove==p happens when a BMP starter grew into a supplementary
                // composite and the removed mark was a single code unit.
                if(pRemove<p) {
                    q=pRemove;
                    r=p;
                    while(r<limit) {
                        *q++=*r++;
                    }
                    limit=q;
                    p=pRemove;
                }
                // Keep prevCC because we removed the combining mark.

                if(p==limit) {
                    break;
                }
                // Is the composite a starter that combines forward?
                if(compositeAndFwd&1) {
                    compositionsList=
                        getCompositionsListForComposite(getNorm16(composite));
                } else {
                    compositionsList=NULL;
                }

                // We combined; continue with looking for compositions.
                continue;
            }
        }

        // no combination this time
        prevCC=cc;
        if(p==limit) {
            break;
        }

        // If c did not combine, then check if it is a starter.
        if(cc==0) {
            // Found a new starter.
            if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
                // It may combine with something, prepare for it.
                // p is already behind c, so step back by the length of c.
                if(U_IS_BMP(c)) {
                    starterIsSupplementary=FALSE;
                    starter=p-1;
                } else {
                    starterIsSupplementary=TRUE;
                    starter=p-2;
                }
            }
        } else if(onlyContiguous) {
            // FCC: no discontiguous compositions; any intervening character blocks.
            compositionsList=NULL;
        }
    }
    buffer.setReorderingLimit(limit);
}

// Returns the composite of the code points a and b, or U_SENTINEL if they
// do not combine.
// Jamo L + Jamo V and Hangul LV + Jamo T are computed arithmetically;
// everything else is looked up with combine() in the compositions list of a.
UChar32
Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
    uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
    const uint16_t *list;
    if(isInert(norm16)) {
        return U_SENTINEL;
    } else if(norm16<minYesNoMappingsOnly) {
        if(isJamoL(norm16)) {
            // b becomes its offset from the first Jamo V.
            b-=Hangul::JAMO_V_BASE;
            if(0<=b && b<Hangul::JAMO_V_COUNT) {
                return
                    (Hangul::HANGUL_BASE+
                     ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
                     Hangul::JAMO_T_COUNT);
            } else {
                return U_SENTINEL;
            }
        } else if(isHangul(norm16)) {
            // b becomes its offset from JAMO_T_BASE.
            b-=Hangul::JAMO_T_BASE;
            if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
                return a+b;
            } else {
                return U_SENTINEL;
            }
        } else {
            // 'a' has a compositions list in extraData
            list=extraData+norm16;
            if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
                list+=  // mapping pointer
                    1+  // +1 to skip the first unit with the mapping length
                    (*list&MAPPING_LENGTH_MASK);  // + mapping length
            }
        }
    } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
        return U_SENTINEL;
    } else {
        list=maybeYesCompositions+norm16-minMaybeYes;
    }
    if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
        return U_SENTINEL;
    }
    // combine() returns -1 for "no composition".
    // NOTE(review): the first branch relies on an arithmetic shift keeping
    // -1>>1 at -1 and on U_SENTINEL being -1 -- confirm against the
    // definition of U_SENTINEL.
#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
    return combine(list, b)>>1;
#else
    int32_t compositeAndFwd=combine(list, b);
    return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
#endif
}

// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
//
// Returns FALSE when copying the NUL-terminated low prefix fails or, for !doCompose,
// as soon as the text is found not to be normalized; returns TRUE otherwise.
// Note that a failed buffer operation only breaks out of the main loop, so TRUE is
// returned in that case as well; callers presumably need to check errorCode.
UBool
Normalizer2Impl::compose(const UChar *src, const UChar *limit,
                         UBool onlyContiguous,
                         UBool doCompose,
                         ReorderingBuffer &buffer,
                         UErrorCode &errorCode) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     * Keeping track of prevBoundary saves us looking for a composition boundary
     * when we find a "no" or "maybe".
     *
     * When we back out from prevSrc back to prevBoundary,
     * then we also remove those same characters (which had been simply copied
     * or canonically-order-inserted) from the ReorderingBuffer.
     * Therefore, at all times, the [prevBoundary..prevSrc[ source units
     * must correspond 1:1 to destination units at the end of the destination buffer.
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        // NUL-terminated input: handle the low prefix first, then locate the terminator
        // so that the rest of the function can work with an explicit limit.
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
                                           doCompose ? &buffer : NULL,
                                           errorCode);
        if(U_FAILURE(errorCode)) {
            return FALSE;
        }
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;

    // only for isNormalized
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src; src!=limit;) {
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // c is a surrogate code unit: try to assemble the supplementary
                // code point from the adjacent unit before looking up its data.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(doCompose) {
                if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
                    break;
                }
            } else {
                prevCC=0;
            }
            if(src==limit) {
                break;
            }
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                --prevBoundary;
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         * Check for Jamo V/T, then for regular characters.
         * c is not a Hangul syllable or Jamo L because those have "yes" properties.
         */
        if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
            UChar prev=*(prevSrc-1);
            UBool needToDecompose=FALSE;
            if(c<Hangul::JAMO_T_BASE) {
                // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                prev=(UChar)(prev-Hangul::JAMO_L_BASE);
                if(prev<Hangul::JAMO_L_COUNT) {
                    if(!doCompose) {
                        return FALSE;
                    }
                    UChar syllable=(UChar)
                        (Hangul::HANGUL_BASE+
                         (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                         Hangul::JAMO_T_COUNT);
                    UChar t;
                    if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
                        ++src;
                        syllable+=t;  // The next character was a Jamo T.
                        prevBoundary=src;
                        buffer.setLastChar(syllable);
                        continue;
                    }
                    // If we see L+V+x where x!=T then we drop to the slow path,
                    // decompose and recompose.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T. We need to either fully compose that
                    // combination here (which would complicate the code and may not work
                    // with strange custom data) or use the slow path -- or else our replacing
                    // two input characters (L+V) with one output character (LV syllable)
                    // would violate the invariant that [prevBoundary..prevSrc[ has the same
                    // length as what we appended to the buffer since prevBoundary.
                    needToDecompose=TRUE;
                }
            } else if(Hangul::isHangulWithoutJamoT(prev)) {
                // c is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if(!doCompose) {
                    return FALSE;
                }
                buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
                prevBoundary=src;
                continue;
            }
            if(!needToDecompose) {
                // The Jamo V/T did not compose into a Hangul syllable.
                if(doCompose) {
                    if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
                        break;
                    }
                } else {
                    prevCC=0;
                }
                continue;
            }
        }
        /*
         * Source buffer pointers:
         *
         *  all done      quick check   current char  not yet
         *                "yes" but     (c)           processed
         *                may combine
         *                forward
         * [-------------[-------------[-------------[-------------[
         * |             |             |             |             |
         * orig. src     prevBoundary  prevSrc       src           limit
         *
         *
         * Destination buffer pointers inside the ReorderingBuffer:
         *
         *  all done      might take    not filled yet
         *                characters for
         *                reordering
         * [-------------[-------------[-------------[
         * |             |             |             |
         * start         reorderStart  limit         |
         *                             +remainingCap.+
         */
        if(norm16>=MIN_YES_YES_WITH_CC) {
            uint8_t cc=(uint8_t)norm16;  // cc!=0
            if( onlyContiguous &&  // FCC
                (doCompose ? buffer.getLastCC() : prevCC)==0 &&
                prevBoundary<prevSrc &&
                // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if(!doCompose) {
                    return FALSE;
                }
            } else if(doCompose) {
                if(!buffer.append(c, cc, errorCode)) {
                    break;
                }
                continue;
            } else if(prevCC<=cc) {
                prevCC=cc;
                continue;
            } else {
                return FALSE;
            }
        } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
            return FALSE;
        }

        /*
         * Find appropriate boundaries around this character,
         * decompose the source text from between the boundaries,
         * and recompose it.
         *
         * We may need to remove the last few characters from the ReorderingBuffer
         * to account for source text that was copied or appended
         * but needs to take part in the recomposition.
         */

        /*
         * Find the last composition boundary in [prevBoundary..src[.
         * It is either the decomposition of the current character (at prevSrc),
         * or prevBoundary.
         */
        if(hasCompBoundaryBefore(c, norm16)) {
            prevBoundary=prevSrc;
        } else if(doCompose) {
            buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
        }

        // Find the next composition boundary in [src..limit[ -
        // modifies src to point to the next starter.
        src=(UChar *)findNextCompBoundary(src, limit);

        // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
        int32_t recomposeStartIndex=buffer.length();
        if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
            break;
        }
        recompose(buffer, recomposeStartIndex, onlyContiguous);
        if(!doCompose) {
            // isNormalized: the recomposed piece must be identical to the source piece;
            // the buffer is then emptied again for the next comparison.
            if(!buffer.equals(prevBoundary, src)) {
                return FALSE;
            }
            buffer.remove();
            prevCC=0;
        }

        // Move to the next starter. We never need to look back before this point again.
        prevBoundary=src;
    }
    return TRUE;
}

// Very similar to compose(): Make the same changes in both places if relevant.
// pQCResult==NULL: spanQuickCheckYes
// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
//
// Returns a pointer into the source text: the limit if the end was reached,
// otherwise prevBoundary at the point where the scan stopped.
const UChar *
Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
                                   UBool onlyContiguous,
                                   UNormalizationCheckResult *pQCResult) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        // No destination buffer is passed here (NULL), and the local errorCode
        // is not inspected afterwards.
        UErrorCode errorCode=U_ZERO_ERROR;
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src;;) {
            if(src==limit) {
                return src;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // c is a surrogate code unit: try to assemble the supplementary
                // code point from the adjacent unit before looking up its data.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        if(src!=prevSrc) {
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                --prevBoundary;
            }
            prevCC=0;
            // The start of the current character (c).
            prevSrc=src;
        }

        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         */
        if(isMaybeOrNonZeroCC(norm16)) {
            uint8_t cc=getCCFromYesOrMaybe(norm16);
            if( onlyContiguous &&  // FCC
                cc!=0 &&
                prevCC==0 &&
                prevBoundary<prevSrc &&
                // prevCC==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test.
                // Falls through to the "no" result below.
            } else if(prevCC<=cc || cc==0) {
                prevCC=cc;
                if(norm16<MIN_YES_YES_WITH_CC) {
                    // A "maybe" character: record it for quickCheck,
                    // or end the span for spanQuickCheckYes.
                    if(pQCResult!=NULL) {
                        *pQCResult=UNORM_MAYBE;
                    } else {
                        return prevBoundary;
                    }
                }
                continue;
            }
        }
        if(pQCResult!=NULL) {
            *pQCResult=UNORM_NO;
        }
        return prevBoundary;
    }
}

// Appends [src..limit[ to the text that is already in the buffer.
// If the buffer is not empty and src does not start at a composition boundary,
// then the destination suffix from its last composition boundary is removed from
// the buffer, joined with the source prefix up to its first composition boundary,
// and that "middle" piece is composed into the buffer (always with doCompose=TRUE).
// safeMiddle receives a copy of the removed destination suffix.
// The remainder of the source is composed if doCompose, otherwise appended as is.
void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
                                       UBool doCompose,
                                       UBool onlyContiguous,
                                       UnicodeString &safeMiddle,
                                       ReorderingBuffer &buffer,
                                       UErrorCode &errorCode) const {
    if(!buffer.isEmpty()) {
        const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
        if(src!=firstStarterInSrc) {
            const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
                                                                    buffer.getLimit());
            int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
            // Copy the suffix before removing it from the buffer.
            UnicodeString middle(lastStarterInDest, destSuffixLength);
            buffer.removeSuffix(destSuffixLength);
            safeMiddle=middle;
            middle.append(src, (int32_t)(firstStarterInSrc-src));
            const UChar *middleStart=middle.getBuffer();
            compose(middleStart, middleStart+middle.length(), onlyContiguous,
                    TRUE, buffer, errorCode);
            if(U_FAILURE(errorCode)) {
                return;
            }
            src=firstStarterInSrc;
        }
    }
    if(doCompose) {
        compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
    } else {
        if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
            limit=u_strchr(src, 0);
        }
        buffer.appendZeroCC(src, limit, errorCode);
    }
}

/**
 * Does c have a composition boundary before it?
 * True if its decomposition begins with a character that has
 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
 * (isCompYesAndZeroCC()) so we need not decompose.
 */
UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
    // Loops only while following algorithmic mappings; every other case returns.
    for(;;) {
        if(isCompYesAndZeroCC(norm16)) {
            return TRUE;
        } else if(isMaybeOrNonZeroCC(norm16)) {
            return FALSE;
        } else if(isDecompNoAlgorithmic(norm16)) {
            c=mapAlgorithmic(c, norm16);
            norm16=getNorm16(c);
        } else {
            // c decomposes, get everything from the variable-length extra data
            const uint16_t *mapping=getMapping(norm16);
            uint16_t firstUnit=*mapping;
            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                // empty mapping
                return FALSE;
            }
            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
                return FALSE;  // non-zero leadCC
            }
            int32_t i=1;  // skip over the firstUnit
            // Note: this local c shadows the parameter c; it holds the first
            // code point of the mapping.
            UChar32 c;
            U16_NEXT_UNSAFE(mapping, i, c);
            return isCompYesAndZeroCC(getNorm16(c));
        }
    }
}

// Does c have a composition boundary after it?
// onlyContiguous: FCC; an additional condition on the first mapping unit applies.
// testInert: selects the norm16 threshold (minNoNo rather than minMaybeYes)
// at and above which the answer is FALSE.
UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
    // Loops only while following algorithmic mappings; every other case returns.
    for(;;) {
        uint16_t norm16=getNorm16(c);
        if(isInert(norm16)) {
            return TRUE;
        } else if(norm16<=minYesNo) {
            // Hangul: norm16==minYesNo
            // Hangul LVT has a boundary after it.
            // Hangul LV and non-inert yesYes characters combine forward.
            return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
        } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
            return FALSE;
        } else if(isDecompNoAlgorithmic(norm16)) {
            c=mapAlgorithmic(c, norm16);
        } else {
            // c decomposes, get everything from the variable-length extra data.
            // If testInert, then c must be a yesNo character which has lccc=0,
            // otherwise it could be a noNo.
            const uint16_t *mapping=getMapping(norm16);
            uint16_t firstUnit=*mapping;
            // TRUE if
            //   not MAPPING_NO_COMP_BOUNDARY_AFTER
            //     (which is set if
            //       c is not deleted, and
            //       it and its decomposition do not combine forward, and it has a starter)
            //   and if FCC then trailCC<=1
            return
                (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
                (!onlyContiguous || firstUnit<=0x1ff);
        }
    }
}

// Iterates backward from p (not before start) until it has passed a code point that
// has a composition boundary before it, and returns the start of that code point.
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
    BackwardUTrie2StringIterator iter(normTrie, start, p);
    uint16_t norm16;
    do {
        norm16=iter.previous16();
    } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
    // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
    // but that's probably not worth the extra cost.
    return iter.codePointStart;
}

// Iterates forward from p (up to limit) until it reaches a code point that
// has a composition boundary before it, and returns the start of that code point.
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
    ForwardUTrie2StringIterator iter(normTrie, p, limit);
    uint16_t norm16;
    do {
        norm16=iter.next16();
    } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
    return iter.codePointStart;
}

// Note: normalizer2impl.cpp r30982 (2011-nov-27)
// still had getFCDTrie() which built and cached an FCD trie.
// That provided faster access to FCD data than getFCD16FromNormData()
// but required synchronization and consumed some 10kB of heap memory
// in any process that uses FCD (e.g., via collation).
// tccc180[] and smallFCD[] are intended to help with any loss of performance,
// at least for Latin & CJK.

// Gets the FCD value from the regular normalization data.
// The result has the lccc in bits 15..8 and the tccc in bits 7..0.
uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
    // Only loops for 1:1 algorithmic mappings.
    for(;;) {
        uint16_t norm16=getNorm16(c);
        if(norm16<=minYesNo) {
            // no decomposition or Hangul syllable, all zeros
            return 0;
        } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
            // combining mark
            // lccc==tccc: the low byte is used for both halves of the result
            norm16&=0xff;
            return norm16|(norm16<<8);
        } else if(norm16>=minMaybeYes) {
            return 0;
        } else if(isDecompNoAlgorithmic(norm16)) {
            c=mapAlgorithmic(c, norm16);
        } else {
            // c decomposes, get everything from the variable-length extra data
            const uint16_t *mapping=getMapping(norm16);
            uint16_t firstUnit=*mapping;
            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                // A character that is deleted (maps to an empty string) must
                // get the worst-case lccc and tccc values because arbitrary
                // characters on both sides will become adjacent.
                return 0x1ff;
            } else {
                norm16=firstUnit>>8;  // tccc
                if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
                    norm16|=*(mapping-1)&0xff00;  // lccc
                }
                return norm16;
            }
        }
    }
}

// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
//
// Returns a pointer into the source text: with buffer==NULL, prevBoundary at the
// first FCD violation; otherwise the position that was reached when the loop ended.
const UChar *
Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
                         ReorderingBuffer *buffer,
                         UErrorCode &errorCode) const {
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    const UChar *prevBoundary=src;
    int32_t prevFCD16=0;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        if(prevBoundary<src) {
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
            prevFCD16=getFCD16(*(src-1));
            if(prevFCD16>1) {
                --prevBoundary;
            }
        }
        limit=u_strchr(src, 0);
    }

    // Note: In this function we use buffer->appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<MIN_CCC_LCCC_CP) {
                // Defer the lookup: remember the code unit itself as a negative value.
                prevFCD16=~c;
                ++src;
            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                prevFCD16=0;
                ++src;
            } else {
                if(U16_IS_SURROGATE(c)) {
                    UChar c2;
                    if(U16_IS_SURROGATE_LEAD(c)) {
                        if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                            c=U16_GET_SUPPLEMENTARY(c, c2);
                        }
                    } else /* trail surrogate */ {
                        if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                            --src;
                            c=U16_GET_SUPPLEMENTARY(c2, c);
                        }
                    }
                }
                if((fcd16=getFCD16FromNormData(c))<=0xff) {
                    // lccc==0
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-U+0300 code point.
                UChar32 prev=~prevFCD16;
                prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
                if(prevFCD16>1) {
                    --prevBoundary;
                }
            } else {
                const UChar *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==NULL) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
                break;
            }
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}

// Appends [src..limit[ to the text that is already in the buffer.
// If the buffer is not empty and src does not start at an FCD boundary,
// then the destination suffix from its last FCD boundary is removed from the buffer,
// joined with the source prefix up to its first FCD boundary,
// and that "middle" piece is run through makeFCD() into the buffer.
// safeMiddle receives a copy of the removed destination suffix.
// The remainder of the source goes through makeFCD() if doMakeFCD,
// otherwise it is appended as is.
void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
                                       UBool doMakeFCD,
                                       UnicodeString &safeMiddle,
                                       ReorderingBuffer &buffer,
                                       UErrorCode &errorCode) const {
    if(!buffer.isEmpty()) {
        const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
        if(src!=firstBoundaryInSrc) {
            const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
                                                                    buffer.getLimit());
            int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
            // Copy the suffix before removing it from the buffer.
            UnicodeString middle(lastBoundaryInDest, destSuffixLength);
            buffer.removeSuffix(destSuffixLength);
            safeMiddle=middle;
            middle.append(src, (int32_t)(firstBoundaryInSrc-src));
            const UChar *middleStart=middle.getBuffer();
            makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
            if(U_FAILURE(errorCode)) {
                return;
            }
            src=firstBoundaryInSrc;
        }
    }
    if(doMakeFCD) {
        makeFCD(src, limit, &buffer, errorCode);
    } else {
        if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
            limit=u_strchr(src, 0);
        }
        buffer.appendZeroCC(src, limit, errorCode);
    }
}

// Returns the position reached when scanning backward from p, stopping at start
// or after previousFCD16() yields a value <=0xff (lccc==0).
// Presumably previousFCD16() moves p back by one code point per call -- confirm
// against its declaration.
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
    while(start<p && previousFCD16(start, p)>0xff) {}
    return p;
}

// Returns the start of the first code point at or after p whose fcd16 is <=0xff
// (lccc==0), or the position reached at limit if there is none.
// Presumably nextFCD16() advances p by one code point per call -- confirm
// against its declaration.
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
    while(p<limit) {
        const UChar *codePointStart=p;
        if(nextFCD16(p, limit)<=0xff) {
            return codePointStart;
        }
    }
    return p;
}

// CanonicalIterator data -------------------------------------------------- ***

// Opens a new trie (initial and error values 0) and creates the vector of start sets,
// which is given uprv_deleteUObject as its element deleter.
CanonIterData::CanonIterData(UErrorCode &errorCode) :
        trie(utrie2_open(0, 0, &errorCode)),
        canonStartSets(uprv_deleteUObject, NULL, errorCode) {}

// Closes the trie.
CanonIterData::~CanonIterData() {
    utrie2_close(trie);
}

// Records that origin's decomposition starts with decompLead.
// The trie value for decompLead either holds a single origin directly
// or, with CANON_HAS_SET, an index into canonStartSets.
void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
    uint32_t canonValue=utrie2_get32(trie, decompLead);
    if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
        // origin is the first character whose decomposition starts with
        // the character for which we are setting the value.
        utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
    } else {
        // origin is not the first character, or it is U+0000.
        UnicodeSet *set;
        if((canonValue&CANON_HAS_SET)==0) {
            // Switch from a single stored origin to a set of origins.
            set=new UnicodeSet;
            if(set==NULL) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
            canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
            utrie2_set32(trie, decompLead, canonValue, &errorCode);
            // NOTE(review): errorCode is not checked after addElement(); whether the
            // set is adopted or leaked when addElement() fails depends on UVector -- confirm.
            canonStartSets.addElement(set, errorCode);
            if(firstOrigin!=0) {
                set->add(firstOrigin);
            }
        } else {
            set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
        }
        set->add(origin);
    }
}

U_CDECL_BEGIN

// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1796// context: the Normalizer2Impl 1797static UBool U_CALLCONV 1798enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1799 UErrorCode errorCode = U_ZERO_ERROR; 1800 if (value != 0) { 1801 Normalizer2Impl *impl = (Normalizer2Impl *)context; 1802 impl->makeCanonIterDataFromNorm16( 1803 start, end, (uint16_t)value, *impl->fCanonIterData, errorCode); 1804 } 1805 return U_SUCCESS(errorCode); 1806} 1807 1808 1809 1810// UInitOnce instantiation function for CanonIterData 1811 1812static void U_CALLCONV 1813initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { 1814 U_ASSERT(impl->fCanonIterData == NULL); 1815 impl->fCanonIterData = new CanonIterData(errorCode); 1816 if (impl->fCanonIterData == NULL) { 1817 errorCode=U_MEMORY_ALLOCATION_ERROR; 1818 } 1819 if (U_SUCCESS(errorCode)) { 1820 utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl); 1821 utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode); 1822 } 1823 if (U_FAILURE(errorCode)) { 1824 delete impl->fCanonIterData; 1825 impl->fCanonIterData = NULL; 1826 } 1827} 1828 1829U_CDECL_END 1830 1831void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 1832 CanonIterData &newData, 1833 UErrorCode &errorCode) const { 1834 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { 1835 // Inert, or 2-way mapping (including Hangul syllable). 1836 // We do not write a canonStartSet for any yesNo character. 1837 // Composites from 2-way mappings are added at runtime from the 1838 // starter's compositions list, and the other characters in 1839 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 1840 // "maybe" characters. 
1841 return; 1842 } 1843 for(UChar32 c=start; c<=end; ++c) { 1844 uint32_t oldValue=utrie2_get32(newData.trie, c); 1845 uint32_t newValue=oldValue; 1846 if(norm16>=minMaybeYes) { 1847 // not a segment starter if it occurs in a decomposition or has cc!=0 1848 newValue|=CANON_NOT_SEGMENT_STARTER; 1849 if(norm16<MIN_NORMAL_MAYBE_YES) { 1850 newValue|=CANON_HAS_COMPOSITIONS; 1851 } 1852 } else if(norm16<minYesNo) { 1853 newValue|=CANON_HAS_COMPOSITIONS; 1854 } else { 1855 // c has a one-way decomposition 1856 UChar32 c2=c; 1857 uint16_t norm16_2=norm16; 1858 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { 1859 c2=mapAlgorithmic(c2, norm16_2); 1860 norm16_2=getNorm16(c2); 1861 } 1862 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) { 1863 // c decomposes, get everything from the variable-length extra data 1864 const uint16_t *mapping=getMapping(norm16_2); 1865 uint16_t firstUnit=*mapping; 1866 int32_t length=firstUnit&MAPPING_LENGTH_MASK; 1867 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1868 if(c==c2 && (*(mapping-1)&0xff)!=0) { 1869 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 1870 } 1871 } 1872 // Skip empty mappings (no characters in the decomposition). 1873 if(length!=0) { 1874 ++mapping; // skip over the firstUnit 1875 // add c to first code point's start set 1876 int32_t i=0; 1877 U16_NEXT_UNSAFE(mapping, i, c2); 1878 newData.addToStartSet(c, c2, errorCode); 1879 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 1880 // one-way mapping. A 2-way mapping is possible here after 1881 // intermediate algorithmic mapping. 
1882 if(norm16_2>=minNoNo) { 1883 while(i<length) { 1884 U16_NEXT_UNSAFE(mapping, i, c2); 1885 uint32_t c2Value=utrie2_get32(newData.trie, c2); 1886 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 1887 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER, 1888 &errorCode); 1889 } 1890 } 1891 } 1892 } 1893 } else { 1894 // c decomposed to c2 algorithmically; c has cc==0 1895 newData.addToStartSet(c, c2, errorCode); 1896 } 1897 } 1898 if(newValue!=oldValue) { 1899 utrie2_set32(newData.trie, c, newValue, &errorCode); 1900 } 1901 } 1902} 1903 1904UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { 1905 // Logically const: Synchronized instantiation. 1906 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 1907 umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode); 1908 return U_SUCCESS(errorCode); 1909} 1910 1911int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { 1912 return (int32_t)utrie2_get32(fCanonIterData->trie, c); 1913} 1914 1915const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { 1916 return *(const UnicodeSet *)fCanonIterData->canonStartSets[n]; 1917} 1918 1919UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { 1920 return getCanonValue(c)>=0; 1921} 1922 1923UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { 1924 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; 1925 if(canonValue==0) { 1926 return FALSE; 1927 } 1928 set.clear(); 1929 int32_t value=canonValue&CANON_VALUE_MASK; 1930 if((canonValue&CANON_HAS_SET)!=0) { 1931 set.addAll(getCanonStartSet(value)); 1932 } else if(value!=0) { 1933 set.add(value); 1934 } 1935 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 1936 uint16_t norm16=getNorm16(c); 1937 if(norm16==JAMO_L) { 1938 UChar32 syllable= 1939 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); 1940 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); 1941 } else { 1942 
addComposites(getCompositionsList(norm16), set); 1943 } 1944 } 1945 return TRUE; 1946} 1947 1948U_NAMESPACE_END 1949 1950// Normalizer2 data swapping ----------------------------------------------- *** 1951 1952U_NAMESPACE_USE 1953 1954U_CAPI int32_t U_EXPORT2 1955unorm2_swap(const UDataSwapper *ds, 1956 const void *inData, int32_t length, void *outData, 1957 UErrorCode *pErrorCode) { 1958 const UDataInfo *pInfo; 1959 int32_t headerSize; 1960 1961 const uint8_t *inBytes; 1962 uint8_t *outBytes; 1963 1964 const int32_t *inIndexes; 1965 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; 1966 1967 int32_t i, offset, nextOffset, size; 1968 1969 /* udata_swapDataHeader checks the arguments */ 1970 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 1971 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1972 return 0; 1973 } 1974 1975 /* check data format and format version */ 1976 pInfo=(const UDataInfo *)((const char *)inData+4); 1977 if(!( 1978 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 1979 pInfo->dataFormat[1]==0x72 && 1980 pInfo->dataFormat[2]==0x6d && 1981 pInfo->dataFormat[3]==0x32 && 1982 (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2) 1983 )) { 1984 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", 1985 pInfo->dataFormat[0], pInfo->dataFormat[1], 1986 pInfo->dataFormat[2], pInfo->dataFormat[3], 1987 pInfo->formatVersion[0]); 1988 *pErrorCode=U_UNSUPPORTED_ERROR; 1989 return 0; 1990 } 1991 1992 inBytes=(const uint8_t *)inData+headerSize; 1993 outBytes=(uint8_t *)outData+headerSize; 1994 1995 inIndexes=(const int32_t *)inBytes; 1996 1997 if(length>=0) { 1998 length-=headerSize; 1999 if(length<(int32_t)sizeof(indexes)) { 2000 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", 2001 length); 2002 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2003 return 0; 2004 } 2005 } 2006 2007 /* read the first few 
indexes */ 2008 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { 2009 indexes[i]=udata_readInt32(ds, inIndexes[i]); 2010 } 2011 2012 /* get the total length of the data */ 2013 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 2014 2015 if(length>=0) { 2016 if(length<size) { 2017 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", 2018 length); 2019 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2020 return 0; 2021 } 2022 2023 /* copy the data for inaccessible bytes */ 2024 if(inBytes!=outBytes) { 2025 uprv_memcpy(outBytes, inBytes, size); 2026 } 2027 2028 offset=0; 2029 2030 /* swap the int32_t indexes[] */ 2031 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; 2032 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); 2033 offset=nextOffset; 2034 2035 /* swap the UTrie2 */ 2036 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; 2037 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 2038 offset=nextOffset; 2039 2040 /* swap the uint16_t extraData[] */ 2041 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; 2042 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 2043 offset=nextOffset; 2044 2045 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ 2046 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; 2047 offset=nextOffset; 2048 2049 U_ASSERT(offset==size); 2050 } 2051 2052 return headerSize+size; 2053} 2054 2055#endif // !UCONFIG_NO_NORMALIZATION 2056