16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org********************************************************************** 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Copyright (C) 2001-2011, International Business Machines 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Corporation and others. All Rights Reserved. 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org********************************************************************** 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Date Name Description 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* 07/03/01 aliu Creation. 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org********************************************************************** 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/ 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_TRANSLITERATION 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/normalizer2.h" 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utf16.h" 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "cstring.h" 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "nortrans.h" 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline Transliterator::Token cstrToken(const char *s) { 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return Transliterator::pointerToken((void *)s); 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * System registration hook. 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid NormalizationTransliterator::registerIDs() { 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // In the Token, the byte after the NUL is the UNormalization2Mode. 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org _create, cstrToken("nfc\0\0")); 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org _create, cstrToken("nfkc\0\0")); 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org _create, cstrToken("nfc\0\1")); 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org _create, cstrToken("nfkc\0\1")); 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"), 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org _create, cstrToken("nfc\0\2")); 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"), 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org _create, cstrToken("nfc\0\3")); 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UNICODE_STRING_SIMPLE("NFD"), TRUE); 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UNICODE_STRING_SIMPLE("NFKD"), TRUE); 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"), 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UNICODE_STRING_SIMPLE("NFD"), FALSE); 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"), 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UNICODE_STRING_SIMPLE("FCD"), FALSE); 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Factory methods 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgTransliterator* NormalizationTransliterator::_create(const UnicodeString& ID, 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Token context) { 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const char *name = (const char *)context.pointer; 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1]; 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode errorCode = U_ZERO_ERROR; 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode); 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_SUCCESS(errorCode)) { 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return new NormalizationTransliterator(ID, *norm2); 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return NULL; 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Constructs a transliterator. 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgNormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id, 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const Normalizer2 &norm2) : 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator(id, 0), fNorm2(norm2) {} 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Destructor. 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgNormalizationTransliterator::~NormalizationTransliterator() { 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Copy constructor. 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgNormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Transliterator(o), fNorm2(o.fNorm2) {} 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Transliterator API. 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgTransliterator* NormalizationTransliterator::clone(void) const { 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return new NormalizationTransliterator(*this); 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Implements {@link Transliterator#handleTransliterate}. 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool isIncremental) const { 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // start and limit of the input range 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t start = offsets.start; 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t limit = offsets.limit; 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(start >= limit) { 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Normalize as short chunks at a time as possible even in 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * bulk mode, so that styled text is minimally disrupted. 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * In incremental mode, a chunk that ends with offsets.limit 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * must not be normalized. 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * If it was known that the input text is not styled, then 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * a bulk mode normalization could look like this: 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString input, normalized; 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t length = limit - start; 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org input.releaseBuffer(length); 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fNorm2.normalize(input, normalized, status); 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org text.handleReplaceBetween(start, limit, normalized); 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t delta = normalized.length() - length; 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.contextLimit += delta; 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.limit += delta; 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.start = limit + delta; 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode errorCode = U_ZERO_ERROR; 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString segment; 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString normalized; 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c = text.char32At(start); 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t prev = start; 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip at least one character so we make progress. 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // c holds the character at start. 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org segment.remove(); 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org segment.append(c); 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start += U16_LENGTH(c); 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start))); 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) { 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // stop in incremental mode when we reach the input limit 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // in case there are additional characters that could change the 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // normalization result 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start=prev; 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fNorm2.normalize(segment, normalized, errorCode); 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(U_FAILURE(errorCode)) { 1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(segment != normalized) { 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // replace the input chunk with its normalized form 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org text.handleReplaceBetween(prev, start, normalized); 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // update all necessary indexes accordingly 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t delta = normalized.length() - (start - prev); 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org start += delta; 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org limit += delta; 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } while(start < limit); 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.start = start; 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.contextLimit += limit - offsets.limit; 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offsets.limit = limit; 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 177