150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/* 250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho******************************************************************************* 350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* 4fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Copyright (C) 2009-2014, International Business Machines 550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* Corporation and others. All Rights Reserved. 650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* 750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho******************************************************************************* 850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* file name: normalizer2impl.h 950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* encoding: US-ASCII 1050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* tab size: 8 (not used) 1150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* indentation:4 1250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* 1350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* created on: 2009nov22 1450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* created by: Markus W. 
Scherer 1550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho*/ 1650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#ifndef __NORMALIZER2IMPL_H__ 1850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#define __NORMALIZER2IMPL_H__ 1950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/utypes.h" 2150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_NORMALIZATION 2350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/normalizer2.h" 2550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/unistr.h" 2650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/unorm.h" 2783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius#include "unicode/utf16.h" 2850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "mutex.h" 2950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uset_imp.h" 3050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "utrie2.h" 3150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 3250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_BEGIN 3350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 34b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehostruct CanonIterData; 3527f654740f2a26ad62a5c155af9199af9e69b889claireho 36fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass U_COMMON_API Hangul { 3750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic: 3850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* Korean Hangul and Jamo constants */ 3950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho enum { 4050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho JAMO_L_BASE=0x1100, /* "lead" jamo */ 41fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius JAMO_L_END=0x1112, 4250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho JAMO_V_BASE=0x1161, /* "vowel" jamo */ 43fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 
JAMO_V_END=0x1175, 4450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho JAMO_T_BASE=0x11a7, /* "trail" jamo */ 45fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius JAMO_T_END=0x11c2, 4650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 4750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho HANGUL_BASE=0xac00, 48fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius HANGUL_END=0xd7a3, 4950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 5050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho JAMO_L_COUNT=19, 5150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho JAMO_V_COUNT=21, 5250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho JAMO_T_COUNT=28, 5350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 5450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, 5550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 5650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, 5750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT 5850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho }; 5950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 6050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static inline UBool isHangul(UChar32 c) { 6150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return HANGUL_BASE<=c && c<HANGUL_LIMIT; 6250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 6350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static inline UBool 6450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isHangulWithoutJamoT(UChar c) { 6550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c-=HANGUL_BASE; 6650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 6750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 6850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static inline UBool isJamoL(UChar32 c) { 6950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT; 7050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 
7150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static inline UBool isJamoV(UChar32 c) { 7250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT; 7350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 7450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 7550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /** 7650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Decomposes c, which must be a Hangul syllable, into buffer 7750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * and returns the length of the decomposition (2 or 3). 7850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 7950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static inline int32_t decompose(UChar32 c, UChar buffer[3]) { 8050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c-=HANGUL_BASE; 8150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c2=c%JAMO_T_COUNT; 8250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c/=JAMO_T_COUNT; 8350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); 8450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); 8550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(c2==0) { 8650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 2; 8750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 8850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer[2]=(UChar)(JAMO_T_BASE+c2); 8950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 3; 9050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 9150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 9283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius 9383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius /** 9483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Decomposes c, which must be a Hangul syllable, into buffer. 9583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * This is the raw, not recursive, decomposition. Its length is always 2. 
9683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius */ 9783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius static inline void getRawDecomposition(UChar32 c, UChar buffer[2]) { 9883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UChar32 orig=c; 9983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius c-=HANGUL_BASE; 10083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UChar32 c2=c%JAMO_T_COUNT; 10183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(c2==0) { 10283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius c/=JAMO_T_COUNT; 10383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); 10483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); 10583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } else { 10683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius buffer[0]=orig-c2; // LV syllable 10783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius buffer[1]=(UChar)(JAMO_T_BASE+c2); 10883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 10983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 11050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoprivate: 11150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Hangul(); // no instantiation 11250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 11350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 11450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass Normalizer2Impl; 11550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass U_COMMON_API ReorderingBuffer : public UMemory { 11750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic: 11850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) : 11950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho impl(ni), str(dest), 12050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho start(NULL), reorderStart(NULL), limit(NULL), 
12150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity(0), lastCC(0) {} 12250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ~ReorderingBuffer() { 12350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(start!=NULL) { 12450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho str.releaseBuffer((int32_t)(limit-start)); 12550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 12650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 12750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool init(int32_t destCapacity, UErrorCode &errorCode); 12850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 12950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isEmpty() const { return start==limit; } 13050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t length() const { return (int32_t)(limit-start); } 13150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar *getStart() { return start; } 13250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar *getLimit() { return limit; } 13350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t getLastCC() const { return lastCC; } 13450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 13550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool equals(const UChar *start, const UChar *limit) const; 13650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 13750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // For Hangul composition, replacing the Leading consonant Jamo with the syllable. 13850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void setLastChar(UChar c) { 13950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *(limit-1)=c; 14050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 14150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 14250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 14350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (c<=0xffff) ? 
14450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho appendBMP((UChar)c, cc, errorCode) : 14550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho appendSupplementary(c, cc, errorCode); 14650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 14750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // s must be in NFD, otherwise change the implementation. 14850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool append(const UChar *s, int32_t length, 14950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t leadCC, uint8_t trailCC, 15050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode); 15150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) { 15250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(remainingCapacity==0 && !resize(1, errorCode)) { 15350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 15450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 15550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(lastCC<=cc || cc==0) { 15650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *limit++=c; 15750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lastCC=cc; 15850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(cc<=1) { 15950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=limit; 16050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 16150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 16250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho insert(c, cc); 16350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 16450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --remainingCapacity; 16550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 16650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 16750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool appendZeroCC(UChar32 c, UErrorCode &errorCode); 16850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode); 16950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
void remove(); 17050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void removeSuffix(int32_t suffixLength); 17150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void setReorderingLimit(UChar *newLimit) { 17250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho remainingCapacity+=(int32_t)(limit-newLimit); 17350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho reorderStart=limit=newLimit; 17450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lastCC=0; 17550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 176b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho void copyReorderableSuffixTo(UnicodeString &s) const { 177b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho s.setTo(reorderStart, (int32_t)(limit-reorderStart)); 178b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 17950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoprivate: 18050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* 18150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * TODO: Revisit whether it makes sense to track reorderStart. 18250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * It is set to after the last known character with cc<=1, 18350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * which stops previousCC() before it reads that character and looks up its cc. 18450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * previousCC() is normally only called from insert(). 18550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * In other words, reorderStart speeds up the insertion of a combining mark 18650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * into a multi-combining mark sequence where it does not belong at the end. 18750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * This might not be worth the trouble. 18850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * On the other hand, it's not a huge amount of trouble. 18950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 19050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * We probably need it for UNORM_SIMPLE_APPEND. 
19150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 19250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 19350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode); 19450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void insert(UChar32 c, uint8_t cc); 19550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static void writeCodePoint(UChar *p, UChar32 c) { 19650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(c<=0xffff) { 19750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *p=(UChar)c; 19850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 19950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p[0]=U16_LEAD(c); 20050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p[1]=U16_TRAIL(c); 20150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 20250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 20350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool resize(int32_t appendLength, UErrorCode &errorCode); 20450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 20550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const Normalizer2Impl &impl; 20650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString &str; 20750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar *start, *reorderStart, *limit; 20850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t remainingCapacity; 20950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t lastCC; 21050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 21150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // private backward iterator 21250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void setIterator() { codePointStart=limit; } 21350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void skipPrevious(); // Requires start<codePointStart. 21450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t previousCC(); // Returns 0 if there is no previous character. 
21550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 21650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar *codePointStart, *codePointLimit; 21750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 21850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 219f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusclass U_COMMON_API Normalizer2Impl : public UObject { 22050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic: 221f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) { 22259d709d503bab6e2b61931737e662dd293b40578ccornelius fCanonIterDataInitOnce.reset(); 22350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 224f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius virtual ~Normalizer2Impl(); 22550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 226f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius void init(const int32_t *inIndexes, const UTrie2 *inTrie, 227f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius const uint16_t *inExtraData, const uint8_t *inSmallFCD); 22850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void addLcccChars(UnicodeSet &set) const; 23050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; 23127f654740f2a26ad62a5c155af9199af9e69b889claireho void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; 23250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 23350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // low-level properties ------------------------------------------------ *** 23450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 23550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UTrie2 *getNormTrie() const { return normTrie; } 23650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 23727f654740f2a26ad62a5c155af9199af9e69b889claireho UBool ensureCanonIterData(UErrorCode &errorCode) const; 23827f654740f2a26ad62a5c155af9199af9e69b889claireho 
23950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); } 24050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 24150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { 24250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 24350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return UNORM_YES; 24450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(minMaybeYes<=norm16) { 24550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return UNORM_MAYBE; 24650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 24750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return UNORM_NO; 24850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 24950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 250fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isAlgorithmicNoNo(uint16_t norm16) const { return limitNoNo<=norm16 && norm16<minMaybeYes; } 25150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; } 25250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; } 25350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 25450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t getCC(uint16_t norm16) const { 25550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(norm16>=MIN_NORMAL_MAYBE_YES) { 25650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return (uint8_t)norm16; 25750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 25850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(norm16<minNoNo || limitNoNo<=norm16) { 25950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 26050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 26150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return getCCFromNoNo(norm16); 
26250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 26350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static uint8_t getCCFromYesOrMaybe(uint16_t norm16) { 26450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0; 26550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 26650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 26783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius /** 26883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Returns the FCD data for code point c. 26983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @param c A Unicode code point. 27083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 27183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius */ 27283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius uint16_t getFCD16(UChar32 c) const { 27383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(c<0) { 27483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return 0; 27583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } else if(c<0x180) { 27683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return tccc180[c]; 27783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } else if(c<=0xffff) { 27883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 27983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 28083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return getFCD16FromNormData(c); 28150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 28283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius /** 28383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Returns the FCD data for the next code point (post-increment). 
28483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Might skip only a lead surrogate rather than the whole surrogate pair if none of 28583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * the supplementary code points associated with the lead surrogate have non-zero FCD data. 28683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @param s A valid pointer into a string. Requires s!=limit. 28783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @param limit The end of the string, or NULL. 28883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 28983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius */ 29083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius uint16_t nextFCD16(const UChar *&s, const UChar *limit) const { 29183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UChar32 c=*s++; 29283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(c<0x180) { 29383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return tccc180[c]; 29483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 29583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return 0; 29683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 29783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UChar c2; 29883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) { 29983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius c=U16_GET_SUPPLEMENTARY(c, c2); 30083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius ++s; 30183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 30283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return getFCD16FromNormData(c); 30350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 30483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius /** 30583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Returns the FCD data for the previous code 
point (pre-decrement). 30683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @param start The start of the string. 30783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @param s A valid pointer into a string. Requires start<s. 30883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 30983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius */ 31083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius uint16_t previousFCD16(const UChar *start, const UChar *&s) const { 31183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UChar32 c=*--s; 31283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(c<0x180) { 31383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return tccc180[c]; 31483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 31583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(!U16_IS_TRAIL(c)) { 31683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(!singleLeadMightHaveNonZeroFCD16(c)) { 31783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return 0; 31883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 31983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } else { 32083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UChar c2; 32183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(start<s && U16_IS_LEAD(c2=*(s-1))) { 32283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius c=U16_GET_SUPPLEMENTARY(c2, c); 32383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius --s; 32483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 32583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 32683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return getFCD16FromNormData(c); 32750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 32850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 32983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius /** Returns the FCD data for U+0000<=c<U+0180. 
*/ 33083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius uint16_t getFCD16FromBelow180(UChar32 c) const { return tccc180[c]; } 33183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius /** Returns TRUE if the single-or-lead code unit c might have non-zero FCD data. */ 33283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const { 33383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // 0<=lead<=0xffff 33483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius uint8_t bits=smallFCD[lead>>8]; 33583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(bits==0) { return false; } 33683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return (UBool)((bits>>((lead>>5)&7))&1); 33783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 33883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius /** Returns the FCD value from the regular normalization data. */ 33983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius uint16_t getFCD16FromNormData(UChar32 c) const; 34050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 34127f654740f2a26ad62a5c155af9199af9e69b889claireho void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 34227f654740f2a26ad62a5c155af9199af9e69b889claireho CanonIterData &newData, UErrorCode &errorCode) const; 34327f654740f2a26ad62a5c155af9199af9e69b889claireho 34450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /** 34583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Gets the decomposition for one code point. 
34650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * @param c code point 34750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * @param buffer out-only buffer for algorithmic decompositions 34850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * @param length out-only, takes the length of the decomposition, if any 34950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * @return pointer to the decomposition, or NULL if none 35050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 35150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; 35250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 35383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius /** 35483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Gets the raw decomposition for one code point. 35583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @param c code point 35683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @param buffer out-only buffer for algorithmic decompositions 35783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @param length out-only, takes the length of the decomposition, if any 35883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @return pointer to the decomposition, or NULL if none 35983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius */ 36083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const; 36183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius 36283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UChar32 composePair(UChar32 a, UChar32 b) const; 36383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius 36427f654740f2a26ad62a5c155af9199af9e69b889claireho UBool isCanonSegmentStarter(UChar32 c) const; 36527f654740f2a26ad62a5c155af9199af9e69b889claireho UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; 36627f654740f2a26ad62a5c155af9199af9e69b889claireho 
36750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho enum { 36850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho MIN_CCC_LCCC_CP=0x300 36950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho }; 37050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 37150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho enum { 37250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho MIN_YES_YES_WITH_CC=0xff01, 37350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho JAMO_VT=0xff00, 37450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho MIN_NORMAL_MAYBE_YES=0xfe00, 37550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho JAMO_L=1, 37650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho MAX_DELTA=0x40 37750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho }; 37850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 37950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho enum { 38050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Byte offsets from the start of the data, after the generic header. 38150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_NORM_TRIE_OFFSET, 38250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_EXTRA_DATA_OFFSET, 38383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius IX_SMALL_FCD_OFFSET, 38450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_RESERVED3_OFFSET, 38550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_RESERVED4_OFFSET, 38650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_RESERVED5_OFFSET, 38750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_RESERVED6_OFFSET, 38850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_TOTAL_SIZE, 38950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 39050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Code point thresholds for quick check codes. 
39150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_MIN_DECOMP_NO_CP, 39250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_MIN_COMP_NO_MAYBE_CP, 39350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 39450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Norm16 value thresholds for quick check combinations and types of extra data. 39583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. 39650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_MIN_NO_NO, 39750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_LIMIT_NO_NO, 39850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_MIN_MAYBE_YES, 39950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 40083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[. 40183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius 40250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_RESERVED15, 40350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IX_COUNT 40450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho }; 40550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 40650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho enum { 40750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho MAPPING_HAS_CCC_LCCC_WORD=0x80, 40883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius MAPPING_HAS_RAW_MAPPING=0x40, 40950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho MAPPING_NO_COMP_BOUNDARY_AFTER=0x20, 41050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho MAPPING_LENGTH_MASK=0x1f 41150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho }; 41250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 41350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho enum { 41450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho COMP_1_LAST_TUPLE=0x8000, 41550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho COMP_1_TRIPLE=1, 41650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho COMP_1_TRAIL_LIMIT=0x3400, 
41750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho COMP_1_TRAIL_MASK=0x7ffe, 41850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit 41950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho COMP_2_TRAIL_SHIFT=6, 42050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho COMP_2_TRAIL_MASK=0xffc0 42150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho }; 42250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 42350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // higher-level functionality ------------------------------------------ *** 42450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 425fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // NFD without an NFD Normalizer2 instance. 426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest, 427fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) const; 428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Decomposes [src, limit[ and writes the result to dest. 430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * limit can be NULL if src is NUL-terminated. 431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * destLengthEstimate is the initial dest buffer capacity and can be -1. 
432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void decompose(const UChar *src, const UChar *limit, 434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString &dest, int32_t destLengthEstimate, 435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) const; 436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 43750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *decompose(const UChar *src, const UChar *limit, 43850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer *buffer, UErrorCode &errorCode) const; 43950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void decomposeAndAppend(const UChar *src, const UChar *limit, 44050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool doDecompose, 441b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString &safeMiddle, 44250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, 44350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const; 44450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool compose(const UChar *src, const UChar *limit, 44550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool onlyContiguous, 44650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool doCompose, 44750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, 44850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const; 44950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *composeQuickCheck(const UChar *src, const UChar *limit, 45050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool onlyContiguous, 45150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UNormalizationCheckResult *pQCResult) const; 45250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void composeAndAppend(const UChar *src, const UChar *limit, 45350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool doCompose, 45450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool 
onlyContiguous, 455b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString &safeMiddle, 45650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, 45750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const; 45850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *makeFCD(const UChar *src, const UChar *limit, 45950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer *buffer, UErrorCode &errorCode) const; 46050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void makeFCDAndAppend(const UChar *src, const UChar *limit, 46150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool doMakeFCD, 462b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString &safeMiddle, 46350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, 46450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const; 46550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 46650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool hasDecompBoundary(UChar32 c, UBool before) const; 46750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } 46850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 46950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool hasCompBoundaryBefore(UChar32 c) const { 47050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); 47150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 47250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const; 47350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 47450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; } 47550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool hasFCDBoundaryAfter(UChar32 c) const { 47650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
uint16_t fcd16=getFCD16(c); 47750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return fcd16<=1 || (fcd16&0xff)==0; 47850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 47950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; } 48050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoprivate: 48150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 48250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; } 48350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static UBool isInert(uint16_t norm16) { return norm16==0; } 48483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius static UBool isJamoL(uint16_t norm16) { return norm16==1; } 48550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } 48650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; } 48750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; } 48850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // UBool isCompYes(uint16_t norm16) const { 48950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 49050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // } 49150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // UBool isCompYesOrMaybe(uint16_t norm16) const { 49250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // return norm16<minNoNo || minMaybeYes<=norm16; 49350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // } 49450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // UBool hasZeroCCFromDecompYes(uint16_t norm16) const { 49550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 
49650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // } 49750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isDecompYesAndZeroCC(uint16_t norm16) const { 49850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return norm16<minYesNo || 49950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16==JAMO_VT || 50050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 50150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 50250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /** 50350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * A little faster and simpler than isDecompYesAndZeroCC() but does not include 50450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * the MaybeYes which combine-forward and have ccc=0. 50550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * (Standard Unicode 5.2 normalization does not have such characters.) 50650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 50750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isMostDecompYesAndZeroCC(uint16_t norm16) const { 50850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 50950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 51050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; } 51150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 51250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // For use with isCompYes(). 51350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 51450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // static uint8_t getCCFromYes(uint16_t norm16) { 51550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // return norm16>=MIN_YES_YES_WITH_CC ? 
(uint8_t)norm16 : 0; 51650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // } 51750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t getCCFromNoNo(uint16_t norm16) const { 51850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *mapping=getMapping(norm16); 51950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { 52083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return (uint8_t)*(mapping-1); 52150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 52250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 0; 52350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 52450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 52550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() 52650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const; 52750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 52850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Requires algorithmic-NoNo. 52950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { 53050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return c+norm16-(minMaybeYes-MAX_DELTA-1); 53150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 53250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 53350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Requires minYesNo<norm16<limitNoNo. 
53450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; } 53550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const { 53650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { 53750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return NULL; 53850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(norm16<minMaybeYes) { 53950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return extraData+norm16; // for yesYes; if Jamo L: harmless empty list 54050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 54150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return maybeYesCompositions+norm16-minMaybeYes; 54250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 54350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 54450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *getCompositionsListForComposite(uint16_t norm16) const { 54550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *list=extraData+norm16; // composite has both mapping & compositions list 54650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return list+ // mapping pointer 54750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1+ // +1 to skip the first unit with the mapping lenth 54883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius (*list&MAPPING_LENGTH_MASK); // + mapping length 54950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 55027f654740f2a26ad62a5c155af9199af9e69b889claireho /** 55127f654740f2a26ad62a5c155af9199af9e69b889claireho * @param c code point must have compositions 55227f654740f2a26ad62a5c155af9199af9e69b889claireho * @return compositions list pointer 55327f654740f2a26ad62a5c155af9199af9e69b889claireho */ 55427f654740f2a26ad62a5c155af9199af9e69b889claireho const uint16_t *getCompositionsList(uint16_t norm16) const { 55527f654740f2a26ad62a5c155af9199af9e69b889claireho return 
isDecompYes(norm16) ? 55627f654740f2a26ad62a5c155af9199af9e69b889claireho getCompositionsListForDecompYes(norm16) : 55727f654740f2a26ad62a5c155af9199af9e69b889claireho getCompositionsListForComposite(norm16); 55827f654740f2a26ad62a5c155af9199af9e69b889claireho } 55950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 56050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *copyLowPrefixFromNulTerminated(const UChar *src, 56150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 minNeedDataCP, 56250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer *buffer, 56350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &errorCode) const; 56450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool decomposeShort(const UChar *src, const UChar *limit, 56550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, UErrorCode &errorCode) const; 56650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool decompose(UChar32 c, uint16_t norm16, 56750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ReorderingBuffer &buffer, UErrorCode &errorCode) const; 56850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 56950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static int32_t combine(const uint16_t *list, UChar32 trail); 57027f654740f2a26ad62a5c155af9199af9e69b889claireho void addComposites(const uint16_t *list, UnicodeSet &set) const; 57150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 57250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool onlyContiguous) const; 57350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 57450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const; 57550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const; 57650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) 
const; 57750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 57850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const; 57950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const; 58050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 58127f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t getCanonValue(UChar32 c) const; 58227f654740f2a26ad62a5c155af9199af9e69b889claireho const UnicodeSet &getCanonStartSet(int32_t n) const; 58327f654740f2a26ad62a5c155af9199af9e69b889claireho 584f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // UVersionInfo dataVersion; 58550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 58650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Code point thresholds for quick check codes. 58750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 minDecompNoCP; 58850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 minCompNoMaybeCP; 58950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 59050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Norm16 value thresholds for quick check combinations and types of extra data. 
59150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t minYesNo; 59283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius uint16_t minYesNoMappingsOnly; 59350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t minNoNo; 59450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t limitNoNo; 59550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t minMaybeYes; 59650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 597f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius const UTrie2 *normTrie; 59850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *maybeYesCompositions; 59950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 60083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 60183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F 60250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 603f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliuspublic: // CanonIterData is public to allow access from C callback functions. 
60459d709d503bab6e2b61931737e662dd293b40578ccornelius UInitOnce fCanonIterDataInitOnce; 60559d709d503bab6e2b61931737e662dd293b40578ccornelius CanonIterData *fCanonIterData; 60650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 60750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 60827f654740f2a26ad62a5c155af9199af9e69b889claireho// bits in canonIterData 60927f654740f2a26ad62a5c155af9199af9e69b889claireho#define CANON_NOT_SEGMENT_STARTER 0x80000000 61027f654740f2a26ad62a5c155af9199af9e69b889claireho#define CANON_HAS_COMPOSITIONS 0x40000000 61127f654740f2a26ad62a5c155af9199af9e69b889claireho#define CANON_HAS_SET 0x200000 61227f654740f2a26ad62a5c155af9199af9e69b889claireho#define CANON_VALUE_MASK 0x1fffff 61327f654740f2a26ad62a5c155af9199af9e69b889claireho 61450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/** 61550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * ICU-internal shortcut for quick access to standard Unicode normalization. 61650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 61750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass U_COMMON_API Normalizer2Factory { 61850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic: 61950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static const Normalizer2 *getFCDInstance(UErrorCode &errorCode); 62050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static const Normalizer2 *getFCCInstance(UErrorCode &errorCode); 62150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static const Normalizer2 *getNoopInstance(UErrorCode &errorCode); 62250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 62350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode); 62450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 62550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode); 62650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode); 
62750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode); 62850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 62950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Get the Impl instance of the Normalizer2. 63050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance. 63150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static const Normalizer2Impl *getImpl(const Normalizer2 *norm2); 63250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoprivate: 63350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Normalizer2Factory(); // No instantiation. 63450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 63550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 63650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_END 63750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 63850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CAPI int32_t U_EXPORT2 63950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehounorm2_swap(const UDataSwapper *ds, 64050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const void *inData, int32_t length, void *outData, 64150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode *pErrorCode); 64250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 64350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/** 64450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Get the NF*_QC property for a code point, for u_getIntPropertyValue(). 
64550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * @internal 64650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 64783a171d1a62abf406f7f44ae671823d5ec20db7dCraig CorneliusU_CFUNC UNormalizationCheckResult 64850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehounorm_getQuickCheck(UChar32 c, UNormalizationMode mode); 64950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 65050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/** 65183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue(). 65250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * @internal 65350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 65483a171d1a62abf406f7f44ae671823d5ec20db7dCraig CorneliusU_CFUNC uint16_t 65583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Corneliusunorm_getFCD16(UChar32 c); 65650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 65750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/** 65850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Format of Normalizer2 .nrm data files. 65983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Format version 2.0. 66050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 66150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms. 66250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * ICU ships with data files for standard Unicode Normalization Forms 66350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm). 66450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Custom (application-specific) data can be built into additional .nrm files 66550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * with the gennorm2 build tool. 
66650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 66750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been 66850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * cached already. Internally, Normalizer2Impl.load() reads the .nrm file. 66950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 67050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * A .nrm file begins with a standard ICU data file header 67150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * (DataHeader, see ucmndata.h and unicode/udata.h). 67250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The UDataInfo.dataVersion field usually contains the Unicode version 67350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * for which the data was generated. 67450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 67550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * After the header, the file contains the following parts. 67650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Constants are defined as enum values of the Normalizer2Impl class. 67750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 67850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Many details of the data structures are described in the design doc 67950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * which is at http://site.icu-project.org/design/normalization/custom 68050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 68150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4; 68250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 68350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The first eight indexes are byte offsets in ascending order. 68450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Each byte offset marks the start of the next part in the data file, 68550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * and the end of the previous one. 
68650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * When two consecutive byte offsets are the same, then the corresponding part is empty. 68750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Byte offsets are offsets from after the header, 68850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * that is, from the beginning of the indexes[]. 68950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Each part starts at an offset with proper alignment for its data. 69050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * If necessary, the previous part may include padding bytes to achieve this alignment. 69150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 69250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point 69350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * with a decomposition mapping, that is, with NF*D_QC=No. 69450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point 69550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward). 69650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 69783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * The next five indexes are thresholds of 16-bit trie values for ranges of 69850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * values indicating multiple normalization properties. 
69950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * minYesNo=indexes[IX_MIN_YES_NO]; 70050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * minNoNo=indexes[IX_MIN_NO_NO]; 70150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * limitNoNo=indexes[IX_LIMIT_NO_NO]; 70250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * minMaybeYes=indexes[IX_MIN_MAYBE_YES]; 70383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 70450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * See the normTrie description below and the design doc for details. 70550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 70650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h 70750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 70850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The trie holds the main normalization data. Each code point is mapped to a 16-bit value. 70950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Rather than using independent bits in the value (which would require more than 16 bits), 71050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * information is extracted primarily via range checks. 71150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo 71250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * means that the character has NF*C_QC=Yes and NF*D_QC=No properties, 71350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * which means it has a two-way (round-trip) decomposition mapping. 71450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData 71583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * pointing to mappings, compositions lists, or both. 
71650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Value norm16==0 means that the character is normalization-inert, that is, 71750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * it does not have a mapping, does not participate in composition, has a zero 71850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * canonical combining class, and forms a boundary where text before it and after it 71950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * can be normalized independently. 72050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * For details about how multiple properties are encoded in 16-bit values 72150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * see the design doc. 72250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Note that the encoding cannot express all combinations of the properties involved; 72350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * it only supports those combinations that are allowed by 72450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * the Unicode Normalization algorithms. Details are in the design doc as well. 72550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The gennorm2 tool only builds .nrm files for data that conforms to the limitations. 72650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 72750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The trie has a value for each lead surrogate code unit representing the "worst case" 72850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * properties of the 1024 supplementary characters whose UTF-16 form starts with 72950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * the lead surrogate. If all of the 1024 supplementary characters are normalization-inert, 73050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * then their lead surrogate code unit has the trie value 0. 
73150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * When the lead surrogate unit's value exceeds the quick check minimum during processing, 73250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * the properties for the full supplementary code point need to be looked up. 73350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 73450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]; 73550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * uint16_t extraData[]; 73650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 73750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * There is only one byte offset for the end of these two arrays. 73850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The split between them is given by the constant and variable mentioned above. 73950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 74083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * The maybeYesCompositions array contains compositions lists for characters that 74150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * combine both forward (as starters in composition pairs) 74250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * and backward (as trailing characters in composition pairs). 74350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Such characters do not occur in Unicode 5.2 but are allowed by 74450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * the Unicode Normalization algorithms. 74550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES 74650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * and the maybeYesCompositions array is empty. 74750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * If there are such characters, then minMaybeYes is subtracted from their norm16 values 74850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * to get the index into this array. 
74950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 75083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * The extraData array contains compositions lists for "YesYes" characters, 75183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * followed by mappings and optional compositions lists for "YesNo" characters, 75250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * followed by only mappings for "NoNo" characters. 75350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * (Referring to pairs of NFC/NFD quick check values.) 75450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * The norm16 values of those characters are directly indexes into the extraData array. 75550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 75683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * The data structures for compositions lists and mappings are described in the design doc. 75783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * 75883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * uint8_t smallFCD[0x100]; -- new in format version 2 75983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * 76083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * This is a bit set to help speed up FCD value lookups in the absence of a full 76183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * UTrie2 or other large data structure with the full FCD value mapping. 76283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * 76383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Each smallFCD bit is set if any of the corresponding 32 BMP code points 76483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * has a non-zero FCD value (lccc!=0 or tccc!=0). 76583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF. 
76683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * A bit for 32 lead surrogates is set if any of the 32k corresponding 76783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * _supplementary_ code points has a non-zero FCD value. 76883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * 76983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * This bit set is most useful for the large blocks of CJK characters with FCD=0. 77083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * 77183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Changes from format version 1 to format version 2 --------------------------- 77283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * 77383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * - Addition of data for raw (not recursively decomposed) mappings. 77483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when 77583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * the mapping is to an empty string or when the character combines-forward. 77683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which 77783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * is then repurposed for the MAPPING_HAS_RAW_MAPPING bit. 77883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * + For details see the design doc. 77983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into 78083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * distinct ranges (combines-forward vs. not) 78183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * so that a range check can be used to find out if there is a compositions list. 
78283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * This is fully equivalent to formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag. 78383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * It is needed for the new (in ICU 49) composePair(), not for other normalization. 78483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * - Addition of the smallFCD[] bit set. 78550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 78650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 78750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif /* !UCONFIG_NO_NORMALIZATION */ 78850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif /* __NORMALIZER2IMPL_H__ */ 789