150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/* 250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho******************************************************************************* 350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* 4fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Copyright (C) 2009-2014, International Business Machines 550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* Corporation and others. All Rights Reserved. 650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* 750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho******************************************************************************* 850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* file name: normalizer2impl.cpp 950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* encoding: US-ASCII 1050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* tab size: 8 (not used) 1150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* indentation:4 1250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* 1350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* created on: 2009nov22 1450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* created by: Markus W. Scherer 1550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho*/ 1650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/utypes.h" 1850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_NORMALIZATION 2050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/normalizer2.h" 2250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/udata.h" 2350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/ustring.h" 24103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf16.h" 2550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "cmemory.h" 2650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "mutex.h" 2750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "normalizer2impl.h" 28103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "putilimp.h" 2950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uassert.h" 3050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uset_imp.h" 3150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "utrie2.h" 3227f654740f2a26ad62a5c155af9199af9e69b889claireho#include "uvector.h" 3350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 3450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_BEGIN 3550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 3650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// ReorderingBuffer -------------------------------------------------------- *** 3750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 3850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { 3950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t length=str.length(); 4050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho start=str.getBuffer(destCapacity); 4150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(start==NULL) { 4250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // getBuffer() already did str.setToBogus() 4350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho errorCode=U_MEMORY_ALLOCATION_ERROR; 4450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 4550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 4650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit=start+length; 4750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity=str.getCapacity()-length; 4850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=start; 4950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(start==limit) { 5050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lastCC=0; 5150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 5250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho setIterator(); 5350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lastCC=previousCC(); 5450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Set reorderStart after the last code point with cc<=1 if there is one. 5550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(lastCC>1) { 5650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(previousCC()>1) {} 5750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 5850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=codePointLimit; 5950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 6050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 6150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 6250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 6350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { 6450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t length=(int32_t)(limit-start); 6550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 6650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho length==(int32_t)(otherLimit-otherStart) && 6750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 0==u_memcmp(start, otherStart, length); 6850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 6950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 7050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 7150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(remainingCapacity<2 && !resize(2, errorCode)) { 7250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 7350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 7450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(lastCC<=cc || cc==0) { 7550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit[0]=U16_LEAD(c); 7650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit[1]=U16_TRAIL(c); 7750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit+=2; 7850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lastCC=cc; 7950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(cc<=1) { 8050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=limit; 8150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 8250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 8350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho insert(c, cc); 8450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 8550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity-=2; 8650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 8750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 8850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 8950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool ReorderingBuffer::append(const UChar *s, int32_t length, 9050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t leadCC, uint8_t trailCC, 9150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) { 9250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(length==0) { 9350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 9450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 9550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(remainingCapacity<length && !resize(length, errorCode)) { 9650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 9750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 9850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity-=length; 9950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(lastCC<=leadCC || leadCC==0) { 10050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(trailCC<=1) { 10150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=limit+length; 10250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(leadCC<=1) { 10350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=limit+1; // Ok if not a code point boundary. 10450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 10550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *sLimit=s+length; 10650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho do { *limit++=*s++; } while(s!=sLimit); 10750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lastCC=trailCC; 10850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 10950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t i=0; 11050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c; 11150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_NEXT(s, i, length, c); 11250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho insert(c, leadCC); // insert first code point 11350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(i<length) { 11450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_NEXT(s, i, length, c); 11550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(i<length) { 11650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // s must be in NFD, otherwise we need to use getCC(). 11750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 11850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 11950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho leadCC=trailCC; 12050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 12150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho append(c, leadCC, errorCode); 12250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 12350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 12450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 12550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 12650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 12750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { 12850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t cpLength=U16_LENGTH(c); 12950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { 13050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 13150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 13250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity-=cpLength; 13350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(cpLength==1) { 13450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *limit++=(UChar)c; 13550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 13650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit[0]=U16_LEAD(c); 13750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit[1]=U16_TRAIL(c); 13850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit+=2; 13950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 14050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lastCC=0; 14150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=limit; 14250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 14350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 14450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 14550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { 14650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(s==sLimit) { 14750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 14850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 14950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t length=(int32_t)(sLimit-s); 15050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(remainingCapacity<length && !resize(length, errorCode)) { 15150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 15250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 15350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho u_memcpy(limit, s, length); 15450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit+=length; 15550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity-=length; 15650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lastCC=0; 15750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=limit; 15850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 15950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 16050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 16150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid ReorderingBuffer::remove() { 16250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=limit=start; 16350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity=str.getCapacity(); 16450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lastCC=0; 16550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 16650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 16750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid ReorderingBuffer::removeSuffix(int32_t suffixLength) { 16850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(suffixLength<(limit-start)) { 16950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit-=suffixLength; 17050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity+=suffixLength; 17150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 17250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit=start; 17350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity=str.getCapacity(); 17450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 17550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lastCC=0; 17650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=limit; 17750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 17850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 17950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { 18050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t reorderStartIndex=(int32_t)(reorderStart-start); 18150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t length=(int32_t)(limit-start); 18250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho str.releaseBuffer(length); 18350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t newCapacity=length+appendLength; 18450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t doubleCapacity=2*str.getCapacity(); 18550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(newCapacity<doubleCapacity) { 18650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho newCapacity=doubleCapacity; 18750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 18850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(newCapacity<256) { 18950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho newCapacity=256; 19050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 19150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho start=str.getBuffer(newCapacity); 19250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(start==NULL) { 19350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // getBuffer() already did str.setToBogus() 19450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho errorCode=U_MEMORY_ALLOCATION_ERROR; 19550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 19650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 19750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=start+reorderStartIndex; 19850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit=start+length; 19950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity=str.getCapacity()-length; 20050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 20150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 20250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 20350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid ReorderingBuffer::skipPrevious() { 20450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho codePointLimit=codePointStart; 20550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar c=*--codePointStart; 20650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { 20750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --codePointStart; 20850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 20950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 21050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 21150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehouint8_t ReorderingBuffer::previousCC() { 21250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho codePointLimit=codePointStart; 21350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(reorderStart>=codePointStart) { 21450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 21550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 21650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c=*--codePointStart; 21750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) { 21850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 21950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 22050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 22150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar c2; 22250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { 22350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --codePointStart; 22450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=U16_GET_SUPPLEMENTARY(c2, c); 22550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 22650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 22750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 22850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 22950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Inserts c somewhere before the last character. 23050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Requires 0<cc<lastCC which implies reorderStart<limit. 23150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid ReorderingBuffer::insert(UChar32 c, uint8_t cc) { 23250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(setIterator(), skipPrevious(); previousCC()>cc;) {} 23350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // insert c at codePointLimit, after the character with prevCC<=cc 23450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar *q=limit; 23550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar *r=limit+=U16_LENGTH(c); 23650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho do { 23750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *--r=*--q; 23850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } while(codePointLimit!=q); 23950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho writeCodePoint(q, c); 24050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(cc<=1) { 24150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=r; 24250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 24350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 24450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 24550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Normalizer2Impl --------------------------------------------------------- *** 24650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 24727f654740f2a26ad62a5c155af9199af9e69b889clairehostruct CanonIterData : public UMemory { 24827f654740f2a26ad62a5c155af9199af9e69b889claireho CanonIterData(UErrorCode &errorCode); 24927f654740f2a26ad62a5c155af9199af9e69b889claireho ~CanonIterData(); 25027f654740f2a26ad62a5c155af9199af9e69b889claireho void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode); 25127f654740f2a26ad62a5c155af9199af9e69b889claireho UTrie2 *trie; 25227f654740f2a26ad62a5c155af9199af9e69b889claireho UVector canonStartSets; // contains UnicodeSet * 25327f654740f2a26ad62a5c155af9199af9e69b889claireho}; 25427f654740f2a26ad62a5c155af9199af9e69b889claireho 25550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2Impl::~Normalizer2Impl() { 25650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho udata_close(memory); 25750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_close(normTrie); 25859d709d503bab6e2b61931737e662dd293b40578ccornelius delete fCanonIterData; 25950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 26050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 26150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool U_CALLCONV 26250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2Impl::isAcceptable(void *context, 26327f654740f2a26ad62a5c155af9199af9e69b889claireho const char * /* type */, const char * /*name*/, 26450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UDataInfo *pInfo) { 26550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( 26650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->size>=20 && 26750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->isBigEndian==U_IS_BIG_ENDIAN && 26850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->charsetFamily==U_CHARSET_FAMILY && 26950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 27050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->dataFormat[1]==0x72 && 27150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->dataFormat[2]==0x6d && 27250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->dataFormat[3]==0x32 && 273103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius pInfo->formatVersion[0]==2 27450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 27550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Normalizer2Impl *me=(Normalizer2Impl *)context; 27650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); 27750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 27850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 27950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 28050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 28150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 28250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 28350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid 28450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { 28550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(errorCode)) { 28650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return; 28750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 28850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); 28950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(errorCode)) { 29050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return; 29150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 29250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); 29350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const int32_t *inIndexes=(const int32_t *)inBytes; 29450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; 29550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(indexesLength<=IX_MIN_MAYBE_YES) { 29650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. 29750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return; 29850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 29950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 30050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 30150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 30250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 30350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho minYesNo=inIndexes[IX_MIN_YES_NO]; 304103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 30550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho minNoNo=inIndexes[IX_MIN_NO_NO]; 30650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 30750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 30850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 30950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; 31050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 31150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, 31250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho inBytes+offset, nextOffset-offset, NULL, 31350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho &errorCode); 31450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(errorCode)) { 31550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return; 31650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 31750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 31850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho offset=nextOffset; 319103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 32050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho maybeYesCompositions=(const uint16_t *)(inBytes+offset); 32150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); 322103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 323103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // smallFCD: new in formatVersion 2 324103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius offset=nextOffset; 325103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius smallFCD=inBytes+offset; 326103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 327103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Build tccc180[]. 328103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. 329103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint8_t bits=0; 330103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius for(UChar c=0; c<0x180; bits>>=1) { 331103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if((c&0xff)==0) { 332103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius bits=smallFCD[c>>8]; // one byte per 0x100 code points 333103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 334103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(bits&1) { 335103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius for(int i=0; i<0x20; ++i, ++c) { 336103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius tccc180[c]=(uint8_t)getFCD16FromNormData(c); 337103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 338103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else { 339103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uprv_memset(tccc180+c, 0, 0x20); 340103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius c+=0x20; 341103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 342103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 34350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 34450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 34550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehouint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { 34650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c; 34750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(cpStart==(cpLimit-1)) { 34850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=*cpStart; 34950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 35050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); 35150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 35250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t prevNorm16=getNorm16(c); 35350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(prevNorm16<=minYesNo) { 35450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 35550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 35650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo 35750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 35850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 35950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusnamespace { 361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass LcccContext { 363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {} 365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void handleRange(UChar32 start, UChar32 end, uint16_t norm16) { 367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(impl.isAlgorithmicNoNo(norm16)) { 368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Range of code points with same-norm16-value algorithmic decompositions. 369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // They might have different non-zero FCD16 values. 370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius do { 371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint16_t fcd16=impl.getFCD16(start); 372fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fcd16>0xff) { set.add(start); } 373fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } while(++start<=end); 374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint16_t fcd16=impl.getFCD16(start); 376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fcd16>0xff) { set.add(start, end); } 377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprivate: 381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const Normalizer2Impl &impl; 382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet &set; 383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 384fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstruct PropertyStartsContext { 386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder) 387fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : impl(ni), sa(adder) {} 388fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 389fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const Normalizer2Impl &impl; 390fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const USetAdder *sa; 391fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 392fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 393fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} // namespace 394fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 39550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CDECL_BEGIN 39650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 39750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UBool U_CALLCONV 398fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusenumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 399fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ((LcccContext *)context)->handleRange(start, end, (uint16_t)value); 400fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 401fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 402fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 403fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic UBool U_CALLCONV 404fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusenumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 405fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /* add the start code point to the USet */ 406fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const PropertyStartsContext *ctx=(const PropertyStartsContext *)context; 407fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const USetAdder *sa=ctx->sa; 408fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sa->add(sa->set, start); 409fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) { 410fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Range of code points with same-norm16-value algorithmic decompositions. 411fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // They might have different non-zero FCD16 values. 412fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint16_t prevFCD16=ctx->impl.getFCD16(start); 413fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(++start<=end) { 414fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint16_t fcd16=ctx->impl.getFCD16(start); 415fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fcd16!=prevFCD16) { 416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sa->add(sa->set, start); 417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius prevFCD16=fcd16; 418fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 419fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 420fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 424fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic UBool U_CALLCONV 42550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoenumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { 42650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* add the start code point to the USet */ 42750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const USetAdder *sa=(const USetAdder *)context; 42850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho sa->add(sa->set, start); 42950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 43050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 43150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 43227f654740f2a26ad62a5c155af9199af9e69b889clairehostatic uint32_t U_CALLCONV 43327f654740f2a26ad62a5c155af9199af9e69b889clairehosegmentStarterMapper(const void * /*context*/, uint32_t value) { 43427f654740f2a26ad62a5c155af9199af9e69b889claireho return value&CANON_NOT_SEGMENT_STARTER; 43527f654740f2a26ad62a5c155af9199af9e69b889claireho} 43627f654740f2a26ad62a5c155af9199af9e69b889claireho 43750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CDECL_END 43850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 43950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid 440fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusNormalizer2Impl::addLcccChars(UnicodeSet &set) const { 441fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /* add the start code point of each same-value range of each trie */ 442fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LcccContext context(*this, set); 443fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_enum(normTrie, NULL, enumLcccRange, &context); 444fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 445fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 446fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 44727f654740f2a26ad62a5c155af9199af9e69b889clairehoNormalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { 44850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* add the start code point of each same-value range of each trie */ 449fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius PropertyStartsContext context(*this, sa); 450fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context); 45150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 45250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* add Hangul LV syllables and LV+1 because of skippables */ 45350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { 45450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho sa->add(sa->set, c); 45550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho sa->add(sa->set, c+1); 45650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 45750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 45850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 45950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 46027f654740f2a26ad62a5c155af9199af9e69b889clairehovoid 46127f654740f2a26ad62a5c155af9199af9e69b889clairehoNormalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { 46227f654740f2a26ad62a5c155af9199af9e69b889claireho /* add the start code point of each same-value range of the canonical iterator data trie */ 46327f654740f2a26ad62a5c155af9199af9e69b889claireho if(ensureCanonIterData(errorCode)) { 46427f654740f2a26ad62a5c155af9199af9e69b889claireho // currently only used for the SEGMENT_STARTER property 46559d709d503bab6e2b61931737e662dd293b40578ccornelius utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa); 46627f654740f2a26ad62a5c155af9199af9e69b889claireho } 46727f654740f2a26ad62a5c155af9199af9e69b889claireho} 46827f654740f2a26ad62a5c155af9199af9e69b889claireho 46950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst UChar * 47050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, 47150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 minNeedDataCP, 47250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer *buffer, 47350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const { 47450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Make some effort to support NUL-terminated strings reasonably. 47550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Take the part of the fast quick check loop that does not look up 47650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // data and check the first part of the string. 47750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // After this prefix, determine the string length to simplify the rest 47850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // of the code. 47950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *prevSrc=src; 48050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar c; 48150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while((c=*src++)<minNeedDataCP && c!=0) {} 48250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Back out the last character for full processing. 48350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Copy this prefix. 48450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(--src!=prevSrc) { 48550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(buffer!=NULL) { 48650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer->appendZeroCC(prevSrc, src, errorCode); 48750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 48850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 48950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return src; 49050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 49150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUnicodeString & 493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusNormalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest, 494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) const { 495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dest.setToBogus(); 497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return dest; 498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 499fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UChar *sArray=src.getBuffer(); 500fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(&dest==&src || sArray==NULL) { 501fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode=U_ILLEGAL_ARGUMENT_ERROR; 502fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dest.setToBogus(); 503fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return dest; 504fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 505fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius decompose(sArray, sArray+src.length(), dest, src.length(), errorCode); 506fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return dest; 507fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 508fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 509fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 510fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusNormalizer2Impl::decompose(const UChar *src, const UChar *limit, 511fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString &dest, 512fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t destLengthEstimate, 513fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) const { 514fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(destLengthEstimate<0 && limit!=NULL) { 515fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius destLengthEstimate=(int32_t)(limit-src); 516fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dest.remove(); 518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ReorderingBuffer buffer(*this, dest); 519fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(buffer.init(destLengthEstimate, errorCode)) { 520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius decompose(src, limit, &buffer, errorCode); 521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 52450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Dual functionality: 52550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// buffer!=NULL: normalize 52650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// buffer==NULL: isNormalized/spanQuickCheckYes 52750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst UChar * 52850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2Impl::decompose(const UChar *src, const UChar *limit, 52950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer *buffer, 53050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const { 53150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 minNoCP=minDecompNoCP; 53250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(limit==NULL) { 53350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); 53450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(errorCode)) { 53550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return src; 53650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 53750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit=u_strchr(src, 0); 53850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 53950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 54050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *prevSrc; 54150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c=0; 54250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16=0; 54350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 54450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // only for quick check 54550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *prevBoundary=src; 54650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t prevCC=0; 54750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 54850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 54950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // count code units below the minimum or with irrelevant data for the quick check 55050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(prevSrc=src; src!=limit;) { 55150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( (c=*src)<minNoCP || 55250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 55350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 55450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++src; 55550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(!U16_IS_SURROGATE(c)) { 55650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 55750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 55850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar c2; 55950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U16_IS_SURROGATE_LEAD(c)) { 56050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 56150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=U16_GET_SUPPLEMENTARY(c, c2); 56250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 56350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else /* trail surrogate */ { 56450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 56550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --src; 56650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=U16_GET_SUPPLEMENTARY(c2, c); 56750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 56850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 56950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { 57050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src+=U16_LENGTH(c); 57150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 57250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 57350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 57450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 57550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 57650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // copy these code units all at once 57750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src!=prevSrc) { 57850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(buffer!=NULL) { 57950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!buffer->appendZeroCC(prevSrc, src, errorCode)) { 58050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 58150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 58250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 58350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=0; 58450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=src; 58550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 58650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 58750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src==limit) { 58850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 58950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 59050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 59150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Check one above-minimum, relevant code point. 59250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src+=U16_LENGTH(c); 59350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(buffer!=NULL) { 59450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!decompose(c, norm16, *buffer, errorCode)) { 59550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 59650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 59750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 59850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isDecompYes(norm16)) { 59950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t cc=getCCFromYesOrMaybe(norm16); 60050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(prevCC<=cc || cc==0) { 60150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=cc; 60250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(cc<=1) { 60350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=src; 60450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 60550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 60650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 60750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 60850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return prevBoundary; // "no" or cc out of order 60950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 61050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 61150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return src; 61250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 61350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 61450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Decompose a short piece of text which is likely to contain characters that 61550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// fail the quick check loop and/or where the quick check loop's overhead 61650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// is unlikely to be amortized. 61750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Called by the compose() and makeFCD() implementations. 61850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit, 61950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, 62050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const { 62150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(src<limit) { 62250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c; 62350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16; 62450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16); 62550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!decompose(c, norm16, buffer, errorCode)) { 62650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 62750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 62850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 62950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 63050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 63150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 63250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, 63350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, 63450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const { 63550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Only loops for 1:1 algorithmic mappings. 63650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 63750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // get the decomposition and the lead and trail cc's 63850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isDecompYes(norm16)) { 63950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c does not decompose 64050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode); 64150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(isHangul(norm16)) { 64250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Hangul syllable: decompose algorithmically 64350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar jamos[3]; 64450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode); 64550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(isDecompNoAlgorithmic(norm16)) { 64650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=mapAlgorithmic(c, norm16); 64750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=getNorm16(c); 64850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 64950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c decomposes, get everything from the variable-length extra data 65050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *mapping=getMapping(norm16); 651103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint16_t firstUnit=*mapping; 65250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t length=firstUnit&MAPPING_LENGTH_MASK; 65350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t leadCC, trailCC; 65450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho trailCC=(uint8_t)(firstUnit>>8); 65550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 656103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius leadCC=(uint8_t)(*(mapping-1)>>8); 65750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 65850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho leadCC=0; 65950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 660103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode); 66150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 66250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 66350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 66450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 66550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst UChar * 66650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { 66750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *decomp=NULL; 66850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16; 66950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 67050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 67150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c does not decompose 67250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return decomp; 67350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(isHangul(norm16)) { 67450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Hangul syllable: decompose algorithmically 67550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho length=Hangul::decompose(c, buffer); 67650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return buffer; 67750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(isDecompNoAlgorithmic(norm16)) { 67850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=mapAlgorithmic(c, norm16); 67950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho decomp=buffer; 68050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho length=0; 68150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_APPEND_UNSAFE(buffer, length, c); 68250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 68350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c decomposes, get everything from the variable-length extra data 68450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *mapping=getMapping(norm16); 685103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius length=*mapping&MAPPING_LENGTH_MASK; 686103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return (const UChar *)mapping+1; 687103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 688103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 689103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius} 690103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 691103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1 692103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// so that a raw mapping fits that consists of one unit ("rm0") 693103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// plus all but the first two code units of the normal mapping. 694103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK. 695103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliusconst UChar * 696103e9ffba2cba345d0078eb8b8db33249f81840aCraig CorneliusNormalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const { 697103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // We do not loop in this method because an algorithmic mapping itself 698103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // becomes a final result rather than having to be decomposed recursively. 699103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint16_t norm16; 700103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 701103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // c does not decompose 702103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return NULL; 703103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else if(isHangul(norm16)) { 704103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Hangul syllable: decompose algorithmically 705103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius Hangul::getRawDecomposition(c, buffer); 706103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius length=2; 707103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return buffer; 708103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else if(isDecompNoAlgorithmic(norm16)) { 709103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius c=mapAlgorithmic(c, norm16); 710103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius length=0; 711103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius U16_APPEND_UNSAFE(buffer, length, c); 712103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return buffer; 713103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else { 714103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // c decomposes, get everything from the variable-length extra data 715103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius const uint16_t *mapping=getMapping(norm16); 716103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint16_t firstUnit=*mapping; 717103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 718103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(firstUnit&MAPPING_HAS_RAW_MAPPING) { 719103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 720103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 721103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1; 722103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint16_t rm0=*rawMapping; 723103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(rm0<=MAPPING_LENGTH_MASK) { 724103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius length=rm0; 725103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return (const UChar *)rawMapping-rm0; 726103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else { 727103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Copy the normal mapping and replace its first two code units with rm0. 728103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius buffer[0]=(UChar)rm0; 729103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2); 730103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius length=mLength-1; 731103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return buffer; 73250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 733103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else { 734103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius length=mLength; 735103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return (const UChar *)mapping+1; 73650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 73750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 73850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 73950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 74050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, 74150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool doDecompose, 742b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString &safeMiddle, 74350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, 74450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const { 745b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho buffer.copyReorderableSuffixTo(safeMiddle); 74650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(doDecompose) { 74750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho decompose(src, limit, &buffer, errorCode); 74850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return; 74950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 75050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Just merge the strings at the boundary. 75150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ForwardUTrie2StringIterator iter(normTrie, src, limit); 75250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t firstCC, prevCC, cc; 75350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho firstCC=prevCC=cc=getCC(iter.next16()); 75450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(cc!=0) { 75550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=cc; 75650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho cc=getCC(iter.next16()); 75750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho }; 758b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(limit==NULL) { // appendZeroCC() needs limit!=NULL 759b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho limit=u_strchr(iter.codePointStart, 0); 760b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 76154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 76254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) { 76350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.appendZeroCC(iter.codePointStart, limit, errorCode); 76454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius } 76550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 76650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 76750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Note: hasDecompBoundary() could be implemented as aliases to 76850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// hasFCDBoundaryBefore() and hasFCDBoundaryAfter() 76950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// at the cost of building the FCD trie for a decomposition normalizer. 77050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { 77150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 77250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(c<minDecompNoCP) { 77350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 77450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 77550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16=getNorm16(c); 77650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { 77750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 77850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(norm16>MIN_NORMAL_MAYBE_YES) { 77950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; // ccc!=0 78050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(isDecompNoAlgorithmic(norm16)) { 78150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=mapAlgorithmic(c, norm16); 78250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 78350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c decomposes, get everything from the variable-length extra data 78450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *mapping=getMapping(norm16); 785103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint16_t firstUnit=*mapping; 78650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if((firstUnit&MAPPING_LENGTH_MASK)==0) { 78750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 78850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 78950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!before) { 79050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // decomp after-boundary: same as hasFCDBoundaryAfter(), 79150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // fcd16<=1 || trailCC==0 79250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(firstUnit>0x1ff) { 79350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; // trailCC>1 79450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 79550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(firstUnit<=0xff) { 79650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; // trailCC==0 79750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 79850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // if(trailCC==1) test leadCC==0, same as checking for before-boundary 79950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 80050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TRUE if leadCC==0 (hasFCDBoundaryBefore()) 801103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; 80250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 80350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 80450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 80550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 80650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/* 80750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Finds the recomposition result for 80850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * a forward-combining "lead" character, 80950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * specified with a pointer to its compositions list, 81050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * and a backward-combining "trail" character. 81150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 81250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * If the lead and trail characters combine, then this function returns 81350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * the following "compositeAndFwd" value: 81450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Bits 21..1 composite character 81550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Bit 0 set if the composite is a forward-combining starter 81650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * otherwise it returns -1. 81750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 81850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The compositions list has (trail, compositeAndFwd) pair entries, 81950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * encoded as either pairs or triples of 16-bit units. 82050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The last entry has the high bit of its first unit set. 82150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 82250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The list is sorted by ascending trail characters (there are no duplicates). 82350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * A linear search is used. 82450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 82550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * See normalizer2impl.h for a more detailed description 82650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * of the compositions list format. 82750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 82850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoint32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { 82950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t key1, firstUnit; 83050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(trail<COMP_1_TRAIL_LIMIT) { 83150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // trail character is 0..33FF 83250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // result entry may have 2 or 3 units 83350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho key1=(uint16_t)(trail<<1); 83450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(key1>(firstUnit=*list)) { 83550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho list+=2+(firstUnit&COMP_1_TRIPLE); 83650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 83750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 83850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(firstUnit&COMP_1_TRIPLE) { 83950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return ((int32_t)list[1]<<16)|list[2]; 84050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 84150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return list[1]; 84250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 84350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 84450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 84550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // trail character is 3400..10FFFF 84650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // result entry has 3 units 84750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ 84827f654740f2a26ad62a5c155af9199af9e69b889claireho (((trail>>COMP_1_TRAIL_SHIFT))& 84927f654740f2a26ad62a5c155af9199af9e69b889claireho ~COMP_1_TRIPLE)); 85050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT); 85150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t secondUnit; 85250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 85350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(key1>(firstUnit=*list)) { 85450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho list+=2+(firstUnit&COMP_1_TRIPLE); 85550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 85650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(key2>(secondUnit=list[1])) { 85750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(firstUnit&COMP_1_LAST_TUPLE) { 85850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 85950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 86050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho list+=3; 86150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 86250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 86350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; 86450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 86550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 86650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 86750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 86850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 86950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 87050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 87150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 87250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return -1; 87350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 87450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 87527f654740f2a26ad62a5c155af9199af9e69b889claireho/** 87627f654740f2a26ad62a5c155af9199af9e69b889claireho * @param list some character's compositions list 87727f654740f2a26ad62a5c155af9199af9e69b889claireho * @param set recursively receives the composites from these compositions 87827f654740f2a26ad62a5c155af9199af9e69b889claireho */ 87927f654740f2a26ad62a5c155af9199af9e69b889clairehovoid Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { 88027f654740f2a26ad62a5c155af9199af9e69b889claireho uint16_t firstUnit; 88127f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t compositeAndFwd; 88227f654740f2a26ad62a5c155af9199af9e69b889claireho do { 88327f654740f2a26ad62a5c155af9199af9e69b889claireho firstUnit=*list; 88427f654740f2a26ad62a5c155af9199af9e69b889claireho if((firstUnit&COMP_1_TRIPLE)==0) { 88527f654740f2a26ad62a5c155af9199af9e69b889claireho compositeAndFwd=list[1]; 88627f654740f2a26ad62a5c155af9199af9e69b889claireho list+=2; 88727f654740f2a26ad62a5c155af9199af9e69b889claireho } else { 88827f654740f2a26ad62a5c155af9199af9e69b889claireho compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; 88927f654740f2a26ad62a5c155af9199af9e69b889claireho list+=3; 89027f654740f2a26ad62a5c155af9199af9e69b889claireho } 89127f654740f2a26ad62a5c155af9199af9e69b889claireho UChar32 composite=compositeAndFwd>>1; 89227f654740f2a26ad62a5c155af9199af9e69b889claireho if((compositeAndFwd&1)!=0) { 89327f654740f2a26ad62a5c155af9199af9e69b889claireho addComposites(getCompositionsListForComposite(getNorm16(composite)), set); 89427f654740f2a26ad62a5c155af9199af9e69b889claireho } 89527f654740f2a26ad62a5c155af9199af9e69b889claireho set.add(composite); 89627f654740f2a26ad62a5c155af9199af9e69b889claireho } while((firstUnit&COMP_1_LAST_TUPLE)==0); 89727f654740f2a26ad62a5c155af9199af9e69b889claireho} 89827f654740f2a26ad62a5c155af9199af9e69b889claireho 89950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/* 90050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Recomposes the buffer text starting at recomposeStartIndex 90150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * (which is in NFD - decomposed and canonically ordered), 90250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * and truncates the buffer contents. 90350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 90450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Note that recomposition never lengthens the text: 90550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Any character consists of either one or two code units; 90650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * a composition may contain at most one more code unit than the original starter, 90750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * while the combining mark that is removed has at least one code unit. 90850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 90950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 91050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool onlyContiguous) const { 91150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar *p=buffer.getStart()+recomposeStartIndex; 91250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar *limit=buffer.getLimit(); 91350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p==limit) { 91450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return; 91550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 91650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 91750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar *starter, *pRemove, *q, *r; 91850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *compositionsList; 91950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c, compositeAndFwd; 92050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16; 92150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t cc, prevCC; 92250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool starterIsSupplementary; 92350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 92450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Some of the following variables are not used until we have a forward-combining starter 92550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // and are only initialized now to avoid compiler warnings. 92650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compositionsList=NULL; // used as indicator for whether we have a forward-combining starter 92750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starter=NULL; 92850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starterIsSupplementary=FALSE; 92950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=0; 93050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 93150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 93250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16); 93350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho cc=getCCFromYesOrMaybe(norm16); 93450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( // this character combines backward and 93550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isMaybe(norm16) && 93650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // we have seen a starter that combines forward and 93750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compositionsList!=NULL && 93850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // the backward-combining character is not blocked 93950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (prevCC<cc || prevCC==0) 94050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 94150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isJamoVT(norm16)) { 94250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c is a Jamo V/T, see if we can compose it with the previous character. 94350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(c<Hangul::JAMO_T_BASE) { 94450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 94550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE); 94650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(prev<Hangul::JAMO_L_COUNT) { 94750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pRemove=p-1; 94850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar syllable=(UChar) 94950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (Hangul::HANGUL_BASE+ 95050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 95150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Hangul::JAMO_T_COUNT); 95250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar t; 95350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 95450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++p; 95550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho syllable+=t; // The next character was a Jamo T. 95650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 95750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *starter=syllable; 95850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // remove the Jamo V/T 95950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho q=pRemove; 96050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho r=p; 96150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(r<limit) { 96250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *q++=*r++; 96350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 96450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit=q; 96550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p=pRemove; 96650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 96750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 96850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 96950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * No "else" for Jamo T: 97050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Since the input is in NFD, there are no Hangul LV syllables that 97150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * a Jamo T could combine with. 97250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * All Jamo Ts are combined above when handling Jamo Vs. 97350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 97450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p==limit) { 97550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 97650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 97750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compositionsList=NULL; 97850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 97950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if((compositeAndFwd=combine(compositionsList, c))>=0) { 98050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // The starter and the combining mark (c) do combine. 98150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 composite=compositeAndFwd>>1; 98250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 98350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Replace the starter with the composite, remove the combining mark. 98450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark 98550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(starterIsSupplementary) { 98650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_IS_SUPPLEMENTARY(composite)) { 98750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // both are supplementary 98850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starter[0]=U16_LEAD(composite); 98950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starter[1]=U16_TRAIL(composite); 99050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 99150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *starter=(UChar)composite; 99250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // The composite is shorter than the starter, 99350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // move the intermediate characters forward one. 99450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starterIsSupplementary=FALSE; 99550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho q=starter+1; 99650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho r=q+1; 99750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(r<pRemove) { 99850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *q++=*r++; 99950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 100050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --pRemove; 100150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 100250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(U_IS_SUPPLEMENTARY(composite)) { 100350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // The composite is longer than the starter, 100450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // move the intermediate characters back one. 100550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starterIsSupplementary=TRUE; 100650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++starter; // temporarily increment for the loop boundary 100750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho q=pRemove; 100850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho r=++pRemove; 100950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(starter<q) { 101050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *--r=*--q; 101150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 101250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *starter=U16_TRAIL(composite); 101350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *--starter=U16_LEAD(composite); // undo the temporary increment 101450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 101550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // both are on the BMP 101650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *starter=(UChar)composite; 101750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 101850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 101950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* remove the combining mark by moving the following text over it */ 102050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(pRemove<p) { 102150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho q=pRemove; 102250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho r=p; 102350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(r<limit) { 102450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *q++=*r++; 102550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 102650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit=q; 102750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p=pRemove; 102850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 102950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Keep prevCC because we removed the combining mark. 103050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 103150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p==limit) { 103250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 103350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 103450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Is the composite a starter that combines forward? 103550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(compositeAndFwd&1) { 103650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compositionsList= 103750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho getCompositionsListForComposite(getNorm16(composite)); 103850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 103950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compositionsList=NULL; 104050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 104150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 104250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // We combined; continue with looking for compositions. 104350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 104450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 104550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 104650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 104750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // no combination this time 104850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=cc; 104950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p==limit) { 105050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 105150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 105250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 105350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // If c did not combine, then check if it is a starter. 105450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(cc==0) { 105550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Found a new starter. 105650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) { 105750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // It may combine with something, prepare for it. 105850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_IS_BMP(c)) { 105950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starterIsSupplementary=FALSE; 106050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starter=p-1; 106150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 106250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starterIsSupplementary=TRUE; 106350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starter=p-2; 106450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 106550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 106650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(onlyContiguous) { 106750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // FCC: no discontiguous compositions; any intervening character blocks. 106850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compositionsList=NULL; 106950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 107050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 107150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.setReorderingLimit(limit); 107250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 107350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1074103e9ffba2cba345d0078eb8b8db33249f81840aCraig CorneliusUChar32 1075103e9ffba2cba345d0078eb8b8db33249f81840aCraig CorneliusNormalizer2Impl::composePair(UChar32 a, UChar32 b) const { 1076103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 1077103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius const uint16_t *list; 1078103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(isInert(norm16)) { 1079103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return U_SENTINEL; 1080103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else if(norm16<minYesNoMappingsOnly) { 1081103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(isJamoL(norm16)) { 1082103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius b-=Hangul::JAMO_V_BASE; 1083103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(0<=b && b<Hangul::JAMO_V_COUNT) { 1084103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return 1085103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (Hangul::HANGUL_BASE+ 1086103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)* 1087103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius Hangul::JAMO_T_COUNT); 1088103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else { 1089103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return U_SENTINEL; 1090103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1091103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else if(isHangul(norm16)) { 1092103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius b-=Hangul::JAMO_T_BASE; 1093103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0! 1094103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return a+b; 1095103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else { 1096103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return U_SENTINEL; 1097103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1098103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else { 1099103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // 'a' has a compositions list in extraData 1100103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius list=extraData+norm16; 1101103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 1102103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius list+= // mapping pointer 1103103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 1+ // +1 to skip the first unit with the mapping lenth 1104103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (*list&MAPPING_LENGTH_MASK); // + mapping length 1105103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1106103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1107103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 1108103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return U_SENTINEL; 1109103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else { 1110103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius list=maybeYesCompositions+norm16-minMaybeYes; 1111103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1112103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 1113103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return U_SENTINEL; 1114103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1115103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC 1116103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return combine(list, b)>>1; 1117103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#else 1118103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius int32_t compositeAndFwd=combine(list, b); 1119103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL; 1120103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#endif 1121103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius} 1122103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 112350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 112450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// doCompose: normalize 112550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// !doCompose: isNormalized (buffer must be empty and initialized) 112650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool 112750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2Impl::compose(const UChar *src, const UChar *limit, 112850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool onlyContiguous, 112950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool doCompose, 113050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, 113150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const { 113250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 113350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * prevBoundary points to the last character before the current one 113450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * that has a composition boundary before it with ccc==0 and quick check "yes". 113550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Keeping track of prevBoundary saves us looking for a composition boundary 113650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * when we find a "no" or "maybe". 113750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 113850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * When we back out from prevSrc back to prevBoundary, 113950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * then we also remove those same characters (which had been simply copied 114050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * or canonically-order-inserted) from the ReorderingBuffer. 114150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Therefore, at all times, the [prevBoundary..prevSrc[ source units 114250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * must correspond 1:1 to destination units at the end of the destination buffer. 114350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 114450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *prevBoundary=src; 114527f654740f2a26ad62a5c155af9199af9e69b889claireho UChar32 minNoMaybeCP=minCompNoMaybeCP; 114627f654740f2a26ad62a5c155af9199af9e69b889claireho if(limit==NULL) { 114727f654740f2a26ad62a5c155af9199af9e69b889claireho src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, 114827f654740f2a26ad62a5c155af9199af9e69b889claireho doCompose ? &buffer : NULL, 114927f654740f2a26ad62a5c155af9199af9e69b889claireho errorCode); 115027f654740f2a26ad62a5c155af9199af9e69b889claireho if(U_FAILURE(errorCode)) { 115127f654740f2a26ad62a5c155af9199af9e69b889claireho return FALSE; 115227f654740f2a26ad62a5c155af9199af9e69b889claireho } 115327f654740f2a26ad62a5c155af9199af9e69b889claireho if(prevBoundary<src) { 115427f654740f2a26ad62a5c155af9199af9e69b889claireho // Set prevBoundary to the last character in the prefix. 115527f654740f2a26ad62a5c155af9199af9e69b889claireho prevBoundary=src-1; 115627f654740f2a26ad62a5c155af9199af9e69b889claireho } 115727f654740f2a26ad62a5c155af9199af9e69b889claireho limit=u_strchr(src, 0); 115827f654740f2a26ad62a5c155af9199af9e69b889claireho } 115927f654740f2a26ad62a5c155af9199af9e69b889claireho 116050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *prevSrc; 116150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c=0; 116250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16=0; 116350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 116450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // only for isNormalized 116550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t prevCC=0; 116650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 116750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 116850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // count code units below the minimum or with irrelevant data for the quick check 116950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(prevSrc=src; src!=limit;) { 117050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( (c=*src)<minNoMaybeCP || 117150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 117250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 117350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++src; 117450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(!U16_IS_SURROGATE(c)) { 117550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 117650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 117750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar c2; 117850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U16_IS_SURROGATE_LEAD(c)) { 117950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 118050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=U16_GET_SUPPLEMENTARY(c, c2); 118150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 118250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else /* trail surrogate */ { 118350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 118450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --src; 118550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=U16_GET_SUPPLEMENTARY(c2, c); 118650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 118750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 118850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 118950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src+=U16_LENGTH(c); 119050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 119150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 119250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 119350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 119450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 119550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // copy these code units all at once 119650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src!=prevSrc) { 119750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(doCompose) { 119850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!buffer.appendZeroCC(prevSrc, src, errorCode)) { 119950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 120050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 120150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 120250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=0; 120350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 120450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src==limit) { 120550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 120650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 120750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Set prevBoundary to the last character in the quick check loop. 120850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=src-1; 120950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && 121050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_IS_LEAD(*(prevBoundary-1)) 121150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 121250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --prevBoundary; 121350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 121450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // The start of the current character (c). 121550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevSrc=src; 121650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(src==limit) { 121750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 121850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 121950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 122050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src+=U16_LENGTH(c); 122150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 122250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 122350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 122450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * or has ccc!=0. 122550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Check for Jamo V/T, then for regular characters. 122650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * c is not a Hangul syllable or Jamo L because those have "yes" properties. 122750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 122850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isJamoVT(norm16) && prevBoundary!=prevSrc) { 122950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar prev=*(prevSrc-1); 123050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool needToDecompose=FALSE; 123150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(c<Hangul::JAMO_T_BASE) { 123250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 123350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prev=(UChar)(prev-Hangul::JAMO_L_BASE); 123450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(prev<Hangul::JAMO_L_COUNT) { 123550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!doCompose) { 123650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 123750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 123850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar syllable=(UChar) 123950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (Hangul::HANGUL_BASE+ 124050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 124150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Hangul::JAMO_T_COUNT); 124250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar t; 124350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 124450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++src; 124550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho syllable+=t; // The next character was a Jamo T. 124650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=src; 124750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.setLastChar(syllable); 124850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 124950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 125050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // If we see L+V+x where x!=T then we drop to the slow path, 125150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // decompose and recompose. 125250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // This is to deal with NFKC finding normal L and V but a 125350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // compatibility variant of a T. We need to either fully compose that 125450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // combination here (which would complicate the code and may not work 125550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // with strange custom data) or use the slow path -- or else our replacing 125650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // two input characters (L+V) with one output character (LV syllable) 125750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // would violate the invariant that [prevBoundary..prevSrc[ has the same 125850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // length as what we appended to the buffer since prevBoundary. 125950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho needToDecompose=TRUE; 126050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 126150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(Hangul::isHangulWithoutJamoT(prev)) { 126250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c is a Jamo Trailing consonant, 126350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // compose with previous Hangul LV that does not contain a Jamo T. 126450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!doCompose) { 126550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 126650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 126750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE)); 126850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=src; 126950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 127050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 127150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!needToDecompose) { 127250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // The Jamo V/T did not compose into a Hangul syllable. 127350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(doCompose) { 127450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!buffer.appendBMP((UChar)c, 0, errorCode)) { 127550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 127650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 127750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 127850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=0; 127950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 128050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 128150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 128250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 128350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 128450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Source buffer pointers: 128550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 128650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * all done quick check current char not yet 128750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * "yes" but (c) processed 128850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * may combine 128950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * forward 129050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * [-------------[-------------[-------------[-------------[ 129150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * | | | | | 129250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * orig. src prevBoundary prevSrc src limit 129350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 129450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 129550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Destination buffer pointers inside the ReorderingBuffer: 129650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 129750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * all done might take not filled yet 129850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * characters for 129950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * reordering 130050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * [-------------[-------------[-------------[ 130150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * | | | | 130250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * start reorderStart limit | 130350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * +remainingCap.+ 130450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 130550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(norm16>=MIN_YES_YES_WITH_CC) { 130650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t cc=(uint8_t)norm16; // cc!=0 130750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( onlyContiguous && // FCC 130850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (doCompose ? buffer.getLastCC() : prevCC)==0 && 130950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary<prevSrc && 131050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that 131150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 131250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // passed the quick check "yes && ccc==0" test. 131350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Check whether the last character was a "yesYes" or a "yesNo". 131450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // If a "yesNo", then we get its trailing ccc from its 131550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // mapping and check for canonical order. 131650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // All other cases are ok. 131750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc 131850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 131950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Fails FCD test, need to decompose and contiguously recompose. 132050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!doCompose) { 132150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 132250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 132350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(doCompose) { 132450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!buffer.append(c, cc, errorCode)) { 132550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 132650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 132750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 132850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(prevCC<=cc) { 132950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=cc; 133050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 133150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 133250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 133350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 133450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { 133550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 133650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 133750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 133850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 133950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Find appropriate boundaries around this character, 134050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * decompose the source text from between the boundaries, 134150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * and recompose it. 134250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 134350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * We may need to remove the last few characters from the ReorderingBuffer 134450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * to account for source text that was copied or appended 134550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * but needs to take part in the recomposition. 134650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 134750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 134850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 134950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Find the last composition boundary in [prevBoundary..src[. 135050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * It is either the decomposition of the current character (at prevSrc), 135150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * or prevBoundary. 135250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 135350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(hasCompBoundaryBefore(c, norm16)) { 135450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=prevSrc; 135550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(doCompose) { 135650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.removeSuffix((int32_t)(prevSrc-prevBoundary)); 135750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 135850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 135950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Find the next composition boundary in [src..limit[ - 136050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // modifies src to point to the next starter. 136150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src=(UChar *)findNextCompBoundary(src, limit); 136250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 136350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. 136450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t recomposeStartIndex=buffer.length(); 136550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!decomposeShort(prevBoundary, src, buffer, errorCode)) { 136650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 136750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 136850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho recompose(buffer, recomposeStartIndex, onlyContiguous); 136950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!doCompose) { 137050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!buffer.equals(prevBoundary, src)) { 137150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 137250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 137350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.remove(); 137450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=0; 137550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 137650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 137750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Move to the next starter. We never need to look back before this point again. 137850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=src; 137950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 138050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 138150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 138250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 138350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Very similar to compose(): Make the same changes in both places if relevant. 138450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// pQCResult==NULL: spanQuickCheckYes 138550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES) 138650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst UChar * 138750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit, 138850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool onlyContiguous, 138950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UNormalizationCheckResult *pQCResult) const { 139027f654740f2a26ad62a5c155af9199af9e69b889claireho /* 139127f654740f2a26ad62a5c155af9199af9e69b889claireho * prevBoundary points to the last character before the current one 139227f654740f2a26ad62a5c155af9199af9e69b889claireho * that has a composition boundary before it with ccc==0 and quick check "yes". 139327f654740f2a26ad62a5c155af9199af9e69b889claireho */ 139427f654740f2a26ad62a5c155af9199af9e69b889claireho const UChar *prevBoundary=src; 139550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 minNoMaybeCP=minCompNoMaybeCP; 139650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(limit==NULL) { 139750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode errorCode=U_ZERO_ERROR; 139850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode); 139927f654740f2a26ad62a5c155af9199af9e69b889claireho if(prevBoundary<src) { 140027f654740f2a26ad62a5c155af9199af9e69b889claireho // Set prevBoundary to the last character in the prefix. 140127f654740f2a26ad62a5c155af9199af9e69b889claireho prevBoundary=src-1; 140227f654740f2a26ad62a5c155af9199af9e69b889claireho } 140350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit=u_strchr(src, 0); 140450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 140550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 140650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *prevSrc; 140750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c=0; 140850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16=0; 140950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t prevCC=0; 141050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 141150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 141250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // count code units below the minimum or with irrelevant data for the quick check 141350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(prevSrc=src;;) { 141450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src==limit) { 141550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return src; 141650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 141750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( (c=*src)<minNoMaybeCP || 141850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 141950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 142050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++src; 142150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(!U16_IS_SURROGATE(c)) { 142250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 142350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 142450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar c2; 142550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U16_IS_SURROGATE_LEAD(c)) { 142650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 142750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=U16_GET_SUPPLEMENTARY(c, c2); 142850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 142950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else /* trail surrogate */ { 143050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 143150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --src; 143250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=U16_GET_SUPPLEMENTARY(c2, c); 143350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 143450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 143550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 143650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src+=U16_LENGTH(c); 143750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 143850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 143950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 144050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 144150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 144250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src!=prevSrc) { 144350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Set prevBoundary to the last character in the quick check loop. 144450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=src-1; 144550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && 144650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_IS_LEAD(*(prevBoundary-1)) 144750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 144850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --prevBoundary; 144950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 145050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=0; 145150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // The start of the current character (c). 145250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevSrc=src; 145350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 145450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 145550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src+=U16_LENGTH(c); 145650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 145750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 145850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 145950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * or has ccc!=0. 146050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 146150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isMaybeOrNonZeroCC(norm16)) { 146250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t cc=getCCFromYesOrMaybe(norm16); 146350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( onlyContiguous && // FCC 146450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho cc!=0 && 146550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC==0 && 146650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary<prevSrc && 146750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // prevCC==0 && prevBoundary<prevSrc tell us that 146850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 146950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // passed the quick check "yes && ccc==0" test. 147050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Check whether the last character was a "yesYes" or a "yesNo". 147150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // If a "yesNo", then we get its trailing ccc from its 147250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // mapping and check for canonical order. 147350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // All other cases are ok. 147450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc 147550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 147650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Fails FCD test. 147750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(prevCC<=cc || cc==0) { 147850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=cc; 147950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(norm16<MIN_YES_YES_WITH_CC) { 148050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(pQCResult!=NULL) { 148150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *pQCResult=UNORM_MAYBE; 148250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 148350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return prevBoundary; 148450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 148550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 148650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 148750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 148850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 148950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(pQCResult!=NULL) { 149050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *pQCResult=UNORM_NO; 149150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 149250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return prevBoundary; 149350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 149450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 149550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 149650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit, 149750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool doCompose, 149850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool onlyContiguous, 1499b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString &safeMiddle, 150050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, 150150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const { 150250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!buffer.isEmpty()) { 150350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *firstStarterInSrc=findNextCompBoundary(src, limit); 150450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src!=firstStarterInSrc) { 150550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(), 150650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.getLimit()); 1507b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest); 1508b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString middle(lastStarterInDest, destSuffixLength); 1509b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho buffer.removeSuffix(destSuffixLength); 1510b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho safeMiddle=middle; 151150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho middle.append(src, (int32_t)(firstStarterInSrc-src)); 151250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *middleStart=middle.getBuffer(); 151350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compose(middleStart, middleStart+middle.length(), onlyContiguous, 151450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho TRUE, buffer, errorCode); 151550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(errorCode)) { 151650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return; 151750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 151850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src=firstStarterInSrc; 151950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 152050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 152150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(doCompose) { 152250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compose(src, limit, onlyContiguous, TRUE, buffer, errorCode); 152350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 1524b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(limit==NULL) { // appendZeroCC() needs limit!=NULL 1525b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho limit=u_strchr(src, 0); 1526b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 152750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.appendZeroCC(src, limit, errorCode); 152850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 152950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 153050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 153150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/** 153250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Does c have a composition boundary before it? 153350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * True if its decomposition begins with a character that has 153450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 153550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 153650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * (isCompYesAndZeroCC()) so we need not decompose. 153750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 153850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { 153950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 154050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isCompYesAndZeroCC(norm16)) { 154150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 154250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(isMaybeOrNonZeroCC(norm16)) { 154350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 154450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(isDecompNoAlgorithmic(norm16)) { 154550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=mapAlgorithmic(c, norm16); 154650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=getNorm16(c); 154750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 154850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c decomposes, get everything from the variable-length extra data 154950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *mapping=getMapping(norm16); 1550103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint16_t firstUnit=*mapping; 155150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if((firstUnit&MAPPING_LENGTH_MASK)==0) { 155250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 155350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 1554103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) { 155550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; // non-zero leadCC 155650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 1557103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius int32_t i=1; // skip over the firstUnit 155850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c; 155950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_NEXT_UNSAFE(mapping, i, c); 156050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return isCompYesAndZeroCC(getNorm16(c)); 156150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 156250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 156350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 156450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 156550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const { 156650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 156750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16=getNorm16(c); 156850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isInert(norm16)) { 156950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 157050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(norm16<=minYesNo) { 1571103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Hangul: norm16==minYesNo 1572103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Hangul LVT has a boundary after it. 157350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Hangul LV and non-inert yesYes characters combine forward. 157450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c); 157550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { 157650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 157750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(isDecompNoAlgorithmic(norm16)) { 157850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c=mapAlgorithmic(c, norm16); 157950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 158050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c decomposes, get everything from the variable-length extra data. 158150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // If testInert, then c must be a yesNo character which has lccc=0, 158250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // otherwise it could be a noNo. 158350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *mapping=getMapping(norm16); 158450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t firstUnit=*mapping; 158550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TRUE if 1586103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // not MAPPING_NO_COMP_BOUNDARY_AFTER 1587103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // (which is set if 1588103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // c is not deleted, and 1589103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // it and its decomposition do not combine forward, and it has a starter) 1590103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // and if FCC then trailCC<=1 159150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 1592103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && 159350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (!onlyContiguous || firstUnit<=0x1ff); 159450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 159550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 159650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 159750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 159850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const { 159950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho BackwardUTrie2StringIterator iter(normTrie, start, p); 160050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16; 160150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho do { 160250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=iter.previous16(); 160350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 160450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, 160550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // but that's probably not worth the extra cost. 160650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return iter.codePointStart; 160750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 160850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 160950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const { 161050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ForwardUTrie2StringIterator iter(normTrie, p, limit); 161150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16; 161250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho do { 161350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=iter.next16(); 161450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 161550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return iter.codePointStart; 161650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 161750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1618103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// Note: normalizer2impl.cpp r30982 (2011-nov-27) 1619103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// still had getFCDTrie() which built and cached an FCD trie. 1620103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// That provided faster access to FCD data than getFCD16FromNormData() 1621103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// but required synchronization and consumed some 10kB of heap memory 1622103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// in any process that uses FCD (e.g., via collation). 1623103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// tccc180[] and smallFCD[] are intended to help with any loss of performance, 1624103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// at least for Latin & CJK. 162550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1626103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// Gets the FCD value from the regular normalization data. 1627103e9ffba2cba345d0078eb8b8db33249f81840aCraig Corneliusuint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { 162850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Only loops for 1:1 algorithmic mappings. 162950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 1630103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint16_t norm16=getNorm16(c); 1631103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(norm16<=minYesNo) { 163250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // no decomposition or Hangul syllable, all zeros 1633103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return 0; 1634103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else if(norm16>=MIN_NORMAL_MAYBE_YES) { 1635103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // combining mark 1636103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius norm16&=0xff; 1637103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return norm16|(norm16<<8); 1638103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else if(norm16>=minMaybeYes) { 1639103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return 0; 1640103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else if(isDecompNoAlgorithmic(norm16)) { 1641103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius c=mapAlgorithmic(c, norm16); 164250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 164350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c decomposes, get everything from the variable-length extra data 164450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *mapping=getMapping(norm16); 164550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t firstUnit=*mapping; 164650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if((firstUnit&MAPPING_LENGTH_MASK)==0) { 164750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // A character that is deleted (maps to an empty string) must 164850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // get the worst-case lccc and tccc values because arbitrary 164950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // characters on both sides will become adjacent. 1650103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return 0x1ff; 165150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 1652103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius norm16=firstUnit>>8; // tccc 165350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 1654103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius norm16|=*(mapping-1)&0xff00; // lccc 165550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 1656103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return norm16; 165750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 165850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 165950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 166050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 166150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 166250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Dual functionality: 166350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// buffer!=NULL: normalize 166450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 166550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst UChar * 166650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2Impl::makeFCD(const UChar *src, const UChar *limit, 166750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer *buffer, 166850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const { 166927f654740f2a26ad62a5c155af9199af9e69b889claireho // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 167027f654740f2a26ad62a5c155af9199af9e69b889claireho // Similar to the prevBoundary in the compose() implementation. 167127f654740f2a26ad62a5c155af9199af9e69b889claireho const UChar *prevBoundary=src; 167227f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t prevFCD16=0; 167350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(limit==NULL) { 167450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode); 167550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(errorCode)) { 167650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return src; 167750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 167827f654740f2a26ad62a5c155af9199af9e69b889claireho if(prevBoundary<src) { 167927f654740f2a26ad62a5c155af9199af9e69b889claireho prevBoundary=src; 168027f654740f2a26ad62a5c155af9199af9e69b889claireho // We know that the previous character's lccc==0. 168127f654740f2a26ad62a5c155af9199af9e69b889claireho // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1682103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius prevFCD16=getFCD16(*(src-1)); 168327f654740f2a26ad62a5c155af9199af9e69b889claireho if(prevFCD16>1) { 168427f654740f2a26ad62a5c155af9199af9e69b889claireho --prevBoundary; 168527f654740f2a26ad62a5c155af9199af9e69b889claireho } 168627f654740f2a26ad62a5c155af9199af9e69b889claireho } 168750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit=u_strchr(src, 0); 168850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 168950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 169050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Note: In this function we use buffer->appendZeroCC() because we track 169150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // the lead and trail combining classes here, rather than leaving it to 169250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // the ReorderingBuffer. 169350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // The exception is the call to decomposeShort() which uses the buffer 169450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // in the normal way. 169550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 169650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *prevSrc; 169750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c=0; 169850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t fcd16=0; 169950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 170050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(;;) { 170150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // count code units with lccc==0 170250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(prevSrc=src; src!=limit;) { 170350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if((c=*src)<MIN_CCC_LCCC_CP) { 170450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevFCD16=~c; 170550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++src; 1706103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1707103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius prevFCD16=0; 170850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++src; 170950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 1710103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(U16_IS_SURROGATE(c)) { 1711103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius UChar c2; 1712103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(U16_IS_SURROGATE_LEAD(c)) { 1713103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1714103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius c=U16_GET_SUPPLEMENTARY(c, c2); 1715103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1716103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else /* trail surrogate */ { 1717103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1718103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius --src; 1719103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius c=U16_GET_SUPPLEMENTARY(c2, c); 1720103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 172150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 172250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 1723103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if((fcd16=getFCD16FromNormData(c))<=0xff) { 172450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevFCD16=fcd16; 172550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src+=U16_LENGTH(c); 172650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 172750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 172850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 172950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 173050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 173150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // copy these code units all at once 173250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src!=prevSrc) { 173350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) { 173450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 173550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 173650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src==limit) { 173750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 173850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 173950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=src; 174050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // We know that the previous character's lccc==0. 174150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(prevFCD16<0) { 174250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1743103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius UChar32 prev=~prevFCD16; 1744103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); 174550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(prevFCD16>1) { 174650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --prevBoundary; 174750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 174850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 174950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *p=src-1; 175050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) { 175150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --p; 175250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Need to fetch the previous character's FCD value because 175350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // prevFCD16 was just for the trail surrogate code point. 1754103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1])); 175550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 175650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 175750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(prevFCD16>1) { 175850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=p; 175950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 176050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 176150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // The start of the current character (c). 176250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevSrc=src; 176350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(src==limit) { 176450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 176550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 176650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 176750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src+=U16_LENGTH(c); 176850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 176950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Check for proper order, and decompose locally if necessary. 177050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if((prevFCD16&0xff)<=(fcd16>>8)) { 177150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // proper order: prev tccc <= current lccc 177250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if((fcd16&0xff)<=1) { 177350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=src; 177450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 177550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) { 177650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 177750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 177850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevFCD16=fcd16; 177950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho continue; 178050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(buffer==NULL) { 178150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return prevBoundary; // quick check "no" 178250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 178350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 178450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Back out the part of the source that we copied or appended 178550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * already but is now going to be decomposed. 178650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * prevSrc is set to after what was copied/appended. 178750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 178850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer->removeSuffix((int32_t)(prevSrc-prevBoundary)); 178950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 179050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Find the part of the source that needs to be decomposed, 179150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * up to the next safe boundary. 179250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 179350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src=findNextFCDBoundary(src, limit); 179450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 179550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The source text does not fulfill the conditions for FCD. 179650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Decompose and reorder a limited piece of the text. 179750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 179850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) { 179950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 180050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 180150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevBoundary=src; 180250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevFCD16=0; 180350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 180450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 180550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return src; 180650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 180750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 180850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, 180950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool doMakeFCD, 1810b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString &safeMiddle, 181150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, 181250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const { 181350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!buffer.isEmpty()) { 181450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit); 181550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(src!=firstBoundaryInSrc) { 181650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), 181750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.getLimit()); 1818b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest); 1819b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString middle(lastBoundaryInDest, destSuffixLength); 1820b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho buffer.removeSuffix(destSuffixLength); 1821b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho safeMiddle=middle; 182250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho middle.append(src, (int32_t)(firstBoundaryInSrc-src)); 182350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *middleStart=middle.getBuffer(); 182450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); 182550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(errorCode)) { 182650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return; 182750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 182850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src=firstBoundaryInSrc; 182950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 183050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 183150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(doMakeFCD) { 183250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho makeFCD(src, limit, &buffer, errorCode); 183350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 1834b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(limit==NULL) { // appendZeroCC() needs limit!=NULL 1835b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho limit=u_strchr(src, 0); 1836b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 183750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.appendZeroCC(src, limit, errorCode); 183850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 183950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 184050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 184150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { 1842103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius while(start<p && previousFCD16(start, p)>0xff) {} 1843103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return p; 184450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 184550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 184650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { 1847103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius while(p<limit) { 1848103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius const UChar *codePointStart=p; 1849103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(nextFCD16(p, limit)<=0xff) { 1850103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return codePointStart; 1851103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1852103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1853103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius return p; 185450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 185550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 185627f654740f2a26ad62a5c155af9199af9e69b889claireho// CanonicalIterator data -------------------------------------------------- *** 185727f654740f2a26ad62a5c155af9199af9e69b889claireho 185827f654740f2a26ad62a5c155af9199af9e69b889clairehoCanonIterData::CanonIterData(UErrorCode &errorCode) : 185927f654740f2a26ad62a5c155af9199af9e69b889claireho trie(utrie2_open(0, 0, &errorCode)), 1860103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius canonStartSets(uprv_deleteUObject, NULL, errorCode) {} 186127f654740f2a26ad62a5c155af9199af9e69b889claireho 186227f654740f2a26ad62a5c155af9199af9e69b889clairehoCanonIterData::~CanonIterData() { 186327f654740f2a26ad62a5c155af9199af9e69b889claireho utrie2_close(trie); 186427f654740f2a26ad62a5c155af9199af9e69b889claireho} 186527f654740f2a26ad62a5c155af9199af9e69b889claireho 186627f654740f2a26ad62a5c155af9199af9e69b889clairehovoid CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) { 186727f654740f2a26ad62a5c155af9199af9e69b889claireho uint32_t canonValue=utrie2_get32(trie, decompLead); 186827f654740f2a26ad62a5c155af9199af9e69b889claireho if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 186927f654740f2a26ad62a5c155af9199af9e69b889claireho // origin is the first character whose decomposition starts with 187027f654740f2a26ad62a5c155af9199af9e69b889claireho // the character for which we are setting the value. 187127f654740f2a26ad62a5c155af9199af9e69b889claireho utrie2_set32(trie, decompLead, canonValue|origin, &errorCode); 187227f654740f2a26ad62a5c155af9199af9e69b889claireho } else { 187327f654740f2a26ad62a5c155af9199af9e69b889claireho // origin is not the first character, or it is U+0000. 187427f654740f2a26ad62a5c155af9199af9e69b889claireho UnicodeSet *set; 187527f654740f2a26ad62a5c155af9199af9e69b889claireho if((canonValue&CANON_HAS_SET)==0) { 187627f654740f2a26ad62a5c155af9199af9e69b889claireho set=new UnicodeSet; 187727f654740f2a26ad62a5c155af9199af9e69b889claireho if(set==NULL) { 187827f654740f2a26ad62a5c155af9199af9e69b889claireho errorCode=U_MEMORY_ALLOCATION_ERROR; 187927f654740f2a26ad62a5c155af9199af9e69b889claireho return; 188027f654740f2a26ad62a5c155af9199af9e69b889claireho } 188127f654740f2a26ad62a5c155af9199af9e69b889claireho UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); 188227f654740f2a26ad62a5c155af9199af9e69b889claireho canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); 188327f654740f2a26ad62a5c155af9199af9e69b889claireho utrie2_set32(trie, decompLead, canonValue, &errorCode); 188427f654740f2a26ad62a5c155af9199af9e69b889claireho canonStartSets.addElement(set, errorCode); 188527f654740f2a26ad62a5c155af9199af9e69b889claireho if(firstOrigin!=0) { 188627f654740f2a26ad62a5c155af9199af9e69b889claireho set->add(firstOrigin); 188727f654740f2a26ad62a5c155af9199af9e69b889claireho } 188827f654740f2a26ad62a5c155af9199af9e69b889claireho } else { 188927f654740f2a26ad62a5c155af9199af9e69b889claireho set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)]; 189027f654740f2a26ad62a5c155af9199af9e69b889claireho } 189127f654740f2a26ad62a5c155af9199af9e69b889claireho set->add(origin); 189227f654740f2a26ad62a5c155af9199af9e69b889claireho } 189327f654740f2a26ad62a5c155af9199af9e69b889claireho} 189427f654740f2a26ad62a5c155af9199af9e69b889claireho 189527f654740f2a26ad62a5c155af9199af9e69b889clairehoU_CDECL_BEGIN 189627f654740f2a26ad62a5c155af9199af9e69b889claireho 189727f654740f2a26ad62a5c155af9199af9e69b889claireho// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. 189859d709d503bab6e2b61931737e662dd293b40578ccornelius// context: the Normalizer2Impl 189927f654740f2a26ad62a5c155af9199af9e69b889clairehostatic UBool U_CALLCONV 190027f654740f2a26ad62a5c155af9199af9e69b889clairehoenumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 190159d709d503bab6e2b61931737e662dd293b40578ccornelius UErrorCode errorCode = U_ZERO_ERROR; 190259d709d503bab6e2b61931737e662dd293b40578ccornelius if (value != 0) { 190359d709d503bab6e2b61931737e662dd293b40578ccornelius Normalizer2Impl *impl = (Normalizer2Impl *)context; 190459d709d503bab6e2b61931737e662dd293b40578ccornelius impl->makeCanonIterDataFromNorm16( 190559d709d503bab6e2b61931737e662dd293b40578ccornelius start, end, (uint16_t)value, *impl->fCanonIterData, errorCode); 190659d709d503bab6e2b61931737e662dd293b40578ccornelius } 190759d709d503bab6e2b61931737e662dd293b40578ccornelius return U_SUCCESS(errorCode); 190827f654740f2a26ad62a5c155af9199af9e69b889claireho} 190927f654740f2a26ad62a5c155af9199af9e69b889claireho 191027f654740f2a26ad62a5c155af9199af9e69b889claireho 191159d709d503bab6e2b61931737e662dd293b40578ccornelius 191259d709d503bab6e2b61931737e662dd293b40578ccornelius// UInitOnce instantiation function for CanonIterData 191359d709d503bab6e2b61931737e662dd293b40578ccornelius 191459d709d503bab6e2b61931737e662dd293b40578ccorneliusstatic void U_CALLCONV 191559d709d503bab6e2b61931737e662dd293b40578ccorneliusinitCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { 191659d709d503bab6e2b61931737e662dd293b40578ccornelius U_ASSERT(impl->fCanonIterData == NULL); 191759d709d503bab6e2b61931737e662dd293b40578ccornelius impl->fCanonIterData = new CanonIterData(errorCode); 191859d709d503bab6e2b61931737e662dd293b40578ccornelius if (impl->fCanonIterData == NULL) { 191927f654740f2a26ad62a5c155af9199af9e69b889claireho errorCode=U_MEMORY_ALLOCATION_ERROR; 192027f654740f2a26ad62a5c155af9199af9e69b889claireho } 192159d709d503bab6e2b61931737e662dd293b40578ccornelius if (U_SUCCESS(errorCode)) { 192259d709d503bab6e2b61931737e662dd293b40578ccornelius utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl); 192359d709d503bab6e2b61931737e662dd293b40578ccornelius utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode); 192459d709d503bab6e2b61931737e662dd293b40578ccornelius } 192559d709d503bab6e2b61931737e662dd293b40578ccornelius if (U_FAILURE(errorCode)) { 192659d709d503bab6e2b61931737e662dd293b40578ccornelius delete impl->fCanonIterData; 192759d709d503bab6e2b61931737e662dd293b40578ccornelius impl->fCanonIterData = NULL; 192827f654740f2a26ad62a5c155af9199af9e69b889claireho } 192927f654740f2a26ad62a5c155af9199af9e69b889claireho} 193027f654740f2a26ad62a5c155af9199af9e69b889claireho 193159d709d503bab6e2b61931737e662dd293b40578ccorneliusU_CDECL_END 193259d709d503bab6e2b61931737e662dd293b40578ccornelius 193327f654740f2a26ad62a5c155af9199af9e69b889clairehovoid Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 193427f654740f2a26ad62a5c155af9199af9e69b889claireho CanonIterData &newData, 193527f654740f2a26ad62a5c155af9199af9e69b889claireho UErrorCode &errorCode) const { 193627f654740f2a26ad62a5c155af9199af9e69b889claireho if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { 193727f654740f2a26ad62a5c155af9199af9e69b889claireho // Inert, or 2-way mapping (including Hangul syllable). 193827f654740f2a26ad62a5c155af9199af9e69b889claireho // We do not write a canonStartSet for any yesNo character. 193927f654740f2a26ad62a5c155af9199af9e69b889claireho // Composites from 2-way mappings are added at runtime from the 194027f654740f2a26ad62a5c155af9199af9e69b889claireho // starter's compositions list, and the other characters in 194127f654740f2a26ad62a5c155af9199af9e69b889claireho // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 194227f654740f2a26ad62a5c155af9199af9e69b889claireho // "maybe" characters. 194327f654740f2a26ad62a5c155af9199af9e69b889claireho return; 194427f654740f2a26ad62a5c155af9199af9e69b889claireho } 194527f654740f2a26ad62a5c155af9199af9e69b889claireho for(UChar32 c=start; c<=end; ++c) { 194627f654740f2a26ad62a5c155af9199af9e69b889claireho uint32_t oldValue=utrie2_get32(newData.trie, c); 194727f654740f2a26ad62a5c155af9199af9e69b889claireho uint32_t newValue=oldValue; 194827f654740f2a26ad62a5c155af9199af9e69b889claireho if(norm16>=minMaybeYes) { 194927f654740f2a26ad62a5c155af9199af9e69b889claireho // not a segment starter if it occurs in a decomposition or has cc!=0 195027f654740f2a26ad62a5c155af9199af9e69b889claireho newValue|=CANON_NOT_SEGMENT_STARTER; 195127f654740f2a26ad62a5c155af9199af9e69b889claireho if(norm16<MIN_NORMAL_MAYBE_YES) { 195227f654740f2a26ad62a5c155af9199af9e69b889claireho newValue|=CANON_HAS_COMPOSITIONS; 195327f654740f2a26ad62a5c155af9199af9e69b889claireho } 195427f654740f2a26ad62a5c155af9199af9e69b889claireho } else if(norm16<minYesNo) { 195527f654740f2a26ad62a5c155af9199af9e69b889claireho newValue|=CANON_HAS_COMPOSITIONS; 195627f654740f2a26ad62a5c155af9199af9e69b889claireho } else { 195727f654740f2a26ad62a5c155af9199af9e69b889claireho // c has a one-way decomposition 195827f654740f2a26ad62a5c155af9199af9e69b889claireho UChar32 c2=c; 195927f654740f2a26ad62a5c155af9199af9e69b889claireho uint16_t norm16_2=norm16; 196027f654740f2a26ad62a5c155af9199af9e69b889claireho while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { 196127f654740f2a26ad62a5c155af9199af9e69b889claireho c2=mapAlgorithmic(c2, norm16_2); 196227f654740f2a26ad62a5c155af9199af9e69b889claireho norm16_2=getNorm16(c2); 196327f654740f2a26ad62a5c155af9199af9e69b889claireho } 196427f654740f2a26ad62a5c155af9199af9e69b889claireho if(minYesNo<=norm16_2 && norm16_2<limitNoNo) { 196527f654740f2a26ad62a5c155af9199af9e69b889claireho // c decomposes, get everything from the variable-length extra data 196627f654740f2a26ad62a5c155af9199af9e69b889claireho const uint16_t *mapping=getMapping(norm16_2); 1967103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint16_t firstUnit=*mapping; 196827f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t length=firstUnit&MAPPING_LENGTH_MASK; 196927f654740f2a26ad62a5c155af9199af9e69b889claireho if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1970103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(c==c2 && (*(mapping-1)&0xff)!=0) { 197127f654740f2a26ad62a5c155af9199af9e69b889claireho newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 197227f654740f2a26ad62a5c155af9199af9e69b889claireho } 197327f654740f2a26ad62a5c155af9199af9e69b889claireho } 197427f654740f2a26ad62a5c155af9199af9e69b889claireho // Skip empty mappings (no characters in the decomposition). 197527f654740f2a26ad62a5c155af9199af9e69b889claireho if(length!=0) { 1976103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius ++mapping; // skip over the firstUnit 197727f654740f2a26ad62a5c155af9199af9e69b889claireho // add c to first code point's start set 197827f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t i=0; 197927f654740f2a26ad62a5c155af9199af9e69b889claireho U16_NEXT_UNSAFE(mapping, i, c2); 198027f654740f2a26ad62a5c155af9199af9e69b889claireho newData.addToStartSet(c, c2, errorCode); 198127f654740f2a26ad62a5c155af9199af9e69b889claireho // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 198227f654740f2a26ad62a5c155af9199af9e69b889claireho // one-way mapping. A 2-way mapping is possible here after 198327f654740f2a26ad62a5c155af9199af9e69b889claireho // intermediate algorithmic mapping. 198427f654740f2a26ad62a5c155af9199af9e69b889claireho if(norm16_2>=minNoNo) { 198527f654740f2a26ad62a5c155af9199af9e69b889claireho while(i<length) { 198627f654740f2a26ad62a5c155af9199af9e69b889claireho U16_NEXT_UNSAFE(mapping, i, c2); 198727f654740f2a26ad62a5c155af9199af9e69b889claireho uint32_t c2Value=utrie2_get32(newData.trie, c2); 198827f654740f2a26ad62a5c155af9199af9e69b889claireho if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 198927f654740f2a26ad62a5c155af9199af9e69b889claireho utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER, 199027f654740f2a26ad62a5c155af9199af9e69b889claireho &errorCode); 199127f654740f2a26ad62a5c155af9199af9e69b889claireho } 199227f654740f2a26ad62a5c155af9199af9e69b889claireho } 199327f654740f2a26ad62a5c155af9199af9e69b889claireho } 199427f654740f2a26ad62a5c155af9199af9e69b889claireho } 199527f654740f2a26ad62a5c155af9199af9e69b889claireho } else { 199627f654740f2a26ad62a5c155af9199af9e69b889claireho // c decomposed to c2 algorithmically; c has cc==0 199727f654740f2a26ad62a5c155af9199af9e69b889claireho newData.addToStartSet(c, c2, errorCode); 199827f654740f2a26ad62a5c155af9199af9e69b889claireho } 199927f654740f2a26ad62a5c155af9199af9e69b889claireho } 200027f654740f2a26ad62a5c155af9199af9e69b889claireho if(newValue!=oldValue) { 200127f654740f2a26ad62a5c155af9199af9e69b889claireho utrie2_set32(newData.trie, c, newValue, &errorCode); 200227f654740f2a26ad62a5c155af9199af9e69b889claireho } 200327f654740f2a26ad62a5c155af9199af9e69b889claireho } 200427f654740f2a26ad62a5c155af9199af9e69b889claireho} 200527f654740f2a26ad62a5c155af9199af9e69b889claireho 200627f654740f2a26ad62a5c155af9199af9e69b889clairehoUBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { 200727f654740f2a26ad62a5c155af9199af9e69b889claireho // Logically const: Synchronized instantiation. 200827f654740f2a26ad62a5c155af9199af9e69b889claireho Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 200959d709d503bab6e2b61931737e662dd293b40578ccornelius umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode); 201027f654740f2a26ad62a5c155af9199af9e69b889claireho return U_SUCCESS(errorCode); 201127f654740f2a26ad62a5c155af9199af9e69b889claireho} 201227f654740f2a26ad62a5c155af9199af9e69b889claireho 201327f654740f2a26ad62a5c155af9199af9e69b889clairehoint32_t Normalizer2Impl::getCanonValue(UChar32 c) const { 201459d709d503bab6e2b61931737e662dd293b40578ccornelius return (int32_t)utrie2_get32(fCanonIterData->trie, c); 201527f654740f2a26ad62a5c155af9199af9e69b889claireho} 201627f654740f2a26ad62a5c155af9199af9e69b889claireho 201727f654740f2a26ad62a5c155af9199af9e69b889clairehoconst UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { 201859d709d503bab6e2b61931737e662dd293b40578ccornelius return *(const UnicodeSet *)fCanonIterData->canonStartSets[n]; 201927f654740f2a26ad62a5c155af9199af9e69b889claireho} 202027f654740f2a26ad62a5c155af9199af9e69b889claireho 202127f654740f2a26ad62a5c155af9199af9e69b889clairehoUBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { 202227f654740f2a26ad62a5c155af9199af9e69b889claireho return getCanonValue(c)>=0; 202327f654740f2a26ad62a5c155af9199af9e69b889claireho} 202427f654740f2a26ad62a5c155af9199af9e69b889claireho 202527f654740f2a26ad62a5c155af9199af9e69b889clairehoUBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { 202627f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; 202727f654740f2a26ad62a5c155af9199af9e69b889claireho if(canonValue==0) { 202827f654740f2a26ad62a5c155af9199af9e69b889claireho return FALSE; 202927f654740f2a26ad62a5c155af9199af9e69b889claireho } 203027f654740f2a26ad62a5c155af9199af9e69b889claireho set.clear(); 203127f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t value=canonValue&CANON_VALUE_MASK; 203227f654740f2a26ad62a5c155af9199af9e69b889claireho if((canonValue&CANON_HAS_SET)!=0) { 203327f654740f2a26ad62a5c155af9199af9e69b889claireho set.addAll(getCanonStartSet(value)); 203427f654740f2a26ad62a5c155af9199af9e69b889claireho } else if(value!=0) { 203527f654740f2a26ad62a5c155af9199af9e69b889claireho set.add(value); 203627f654740f2a26ad62a5c155af9199af9e69b889claireho } 203727f654740f2a26ad62a5c155af9199af9e69b889claireho if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 203827f654740f2a26ad62a5c155af9199af9e69b889claireho uint16_t norm16=getNorm16(c); 203927f654740f2a26ad62a5c155af9199af9e69b889claireho if(norm16==JAMO_L) { 204027f654740f2a26ad62a5c155af9199af9e69b889claireho UChar32 syllable= 204127f654740f2a26ad62a5c155af9199af9e69b889claireho (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); 204227f654740f2a26ad62a5c155af9199af9e69b889claireho set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); 204327f654740f2a26ad62a5c155af9199af9e69b889claireho } else { 204427f654740f2a26ad62a5c155af9199af9e69b889claireho addComposites(getCompositionsList(norm16), set); 204527f654740f2a26ad62a5c155af9199af9e69b889claireho } 204627f654740f2a26ad62a5c155af9199af9e69b889claireho } 204727f654740f2a26ad62a5c155af9199af9e69b889claireho return TRUE; 204827f654740f2a26ad62a5c155af9199af9e69b889claireho} 204927f654740f2a26ad62a5c155af9199af9e69b889claireho 205050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_END 205150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 205250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Normalizer2 data swapping ----------------------------------------------- *** 205350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 205450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_USE 205550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 205650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CAPI int32_t U_EXPORT2 205750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehounorm2_swap(const UDataSwapper *ds, 205850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const void *inData, int32_t length, void *outData, 205950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode *pErrorCode) { 206050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UDataInfo *pInfo; 206150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t headerSize; 206250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 206350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint8_t *inBytes; 206450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t *outBytes; 206550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 206650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const int32_t *inIndexes; 206750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; 206850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 206950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t i, offset, nextOffset, size; 207050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 207150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* udata_swapDataHeader checks the arguments */ 207250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 207350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 207450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 207550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 207650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 207750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* check data format and format version */ 207850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo=(const UDataInfo *)((const char *)inData+4); 207950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!( 208050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 208150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->dataFormat[1]==0x72 && 208250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->dataFormat[2]==0x6d && 208350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->dataFormat[3]==0x32 && 2084103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2) 208550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho )) { 208650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", 208750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->dataFormat[0], pInfo->dataFormat[1], 208850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->dataFormat[2], pInfo->dataFormat[3], 208950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho pInfo->formatVersion[0]); 209050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *pErrorCode=U_UNSUPPORTED_ERROR; 209150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 209250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 209350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 209450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho inBytes=(const uint8_t *)inData+headerSize; 209550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho outBytes=(uint8_t *)outData+headerSize; 209650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 209750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho inIndexes=(const int32_t *)inBytes; 209850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 209950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(length>=0) { 210050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho length-=headerSize; 210127f654740f2a26ad62a5c155af9199af9e69b889claireho if(length<(int32_t)sizeof(indexes)) { 210250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", 210350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho length); 210450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 210550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 210650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 210750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 210850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 210950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* read the first few indexes */ 211050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { 211150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[i]=udata_readInt32(ds, inIndexes[i]); 211250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 211350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 211450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* get the total length of the data */ 211550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 211650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 211750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(length>=0) { 211850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(length<size) { 211950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", 212050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho length); 212150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 212250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 212350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 212450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 212550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* copy the data for inaccessible bytes */ 212650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(inBytes!=outBytes) { 212750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uprv_memcpy(outBytes, inBytes, size); 212850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 212950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 213050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho offset=0; 213150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 213250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* swap the int32_t indexes[] */ 213350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; 213450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); 213550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho offset=nextOffset; 213650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 213750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* swap the UTrie2 */ 213850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; 213950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 214050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho offset=nextOffset; 214150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 214250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* swap the uint16_t extraData[] */ 2143103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; 214450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 214550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho offset=nextOffset; 214650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2147103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ 2148103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; 2149103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius offset=nextOffset; 2150103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 215150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_ASSERT(offset==size); 215250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 215350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 215450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return headerSize+size; 215550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 215650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 215750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif // !UCONFIG_NO_NORMALIZATION 2158