1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho* Copyright (C) 2001-2010, International Business Machines 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Date Name Description 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 07/03/01 aliu Creation. 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_TRANSLITERATION 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/normalizer2.h" 1650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "cstring.h" 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "nortrans.h" 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_BEGIN 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic inline Transliterator::Token cstrToken(const char *s) { 2450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return Transliterator::pointerToken((void *)s); 2550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 2650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * System registration hook. 29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid NormalizationTransliterator::registerIDs() { 3150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // In the Token, the byte after the NUL is the UNormalization2Mode. 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), 3350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfc\0\0")); 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), 3550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfkc\0\0")); 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), 3750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfc\0\1")); 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), 3950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfkc\0\1")); 4050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"), 4150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfc\0\2")); 4250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"), 4350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfc\0\3")); 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UNICODE_STRING_SIMPLE("NFD"), TRUE); 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UNICODE_STRING_SIMPLE("NFKD"), TRUE); 4850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"), 4950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UNICODE_STRING_SIMPLE("NFD"), FALSE); 5050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"), 5150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UNICODE_STRING_SIMPLE("FCD"), FALSE); 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Factory methods 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruTransliterator* NormalizationTransliterator::_create(const UnicodeString& ID, 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Token context) { 5950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const char *name = (const char *)context.pointer; 6050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1]; 6150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode errorCode = U_ZERO_ERROR; 6250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode); 6350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_SUCCESS(errorCode)) { 6450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return new NormalizationTransliterator(ID, *norm2); 6550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 6650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return NULL; 6750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Constructs a transliterator. 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 7350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id, 7450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const Normalizer2 &norm2) : 7550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator(id, 0), fNorm2(norm2) {} 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Destructor. 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruNormalizationTransliterator::~NormalizationTransliterator() { 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Copy constructor. 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruNormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : 8750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator(o), fNorm2(o.fNorm2) {} 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Transliterator API. 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruTransliterator* NormalizationTransliterator::clone(void) const { 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return new NormalizationTransliterator(*this); 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Implements {@link Transliterator#handleTransliterate}. 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool isIncremental) const { 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // start and limit of the input range 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start = offsets.start; 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t limit = offsets.limit; 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(start >= limit) { 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Normalize as short chunks at a time as possible even in 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * bulk mode, so that styled text is minimally disrupted. 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * In incremental mode, a chunk that ends with offsets.limit 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * must not be normalized. 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * If it was known that the input text is not styled, then 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * a bulk mode normalization could look like this: 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 11750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString input, normalized; 11850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t length = limit - start; 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru input.releaseBuffer(length); 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 12350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fNorm2.normalize(input, normalized, status); 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 12550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho text.handleReplaceBetween(start, limit, normalized); 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 12750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t delta = normalized.length() - length; 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offsets.contextLimit += delta; 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offsets.limit += delta; 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offsets.start = limit + delta; 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 13350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode errorCode = U_ZERO_ERROR; 13450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString segment; 13550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString normalized; 13650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c = text.char32At(start); 13750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho do { 13850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t prev = start; 13950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Skip at least one character so we make progress. 14050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c holds the character at start. 14150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho segment.remove(); 14250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho do { 14350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho segment.append(c); 14450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho start += U16_LENGTH(c); 14550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start))); 14650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) { 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // stop in incremental mode when we reach the input limit 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // in case there are additional characters that could change the 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalization result 15050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho start=prev; 15150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 15350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fNorm2.normalize(segment, normalized, errorCode); 15450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(errorCode)) { 15550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 15650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 15750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(segment != normalized) { 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // replace the input chunk with its normalized form 15950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho text.handleReplaceBetween(prev, start, normalized); 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // update all necessary indexes accordingly 16250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t delta = normalized.length() - (start - prev); 16350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho start += delta; 16450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit += delta; 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 16650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } while(start < limit); 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offsets.start = start; 16950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho offsets.contextLimit += limit - offsets.limit; 17050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho offsets.limit = limit; 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_END 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 176