164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// Copyright (C) 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/* 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius* Copyright (C) 2001-2011, International Business Machines 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* Date Name Description 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru* 07/03/01 aliu Creation. 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru********************************************************************** 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/ 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_TRANSLITERATION 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/normalizer2.h" 1883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius#include "unicode/utf16.h" 1950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "cstring.h" 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "nortrans.h" 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_BEGIN 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic inline Transliterator::Token cstrToken(const char *s) { 2750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return Transliterator::pointerToken((void *)s); 2850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 2950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * System registration hook. 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid NormalizationTransliterator::registerIDs() { 3450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // In the Token, the byte after the NUL is the UNormalization2Mode. 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), 3650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfc\0\0")); 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), 3850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfkc\0\0")); 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), 4050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfc\0\1")); 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), 4250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfkc\0\1")); 4350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"), 4450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfc\0\2")); 4550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"), 4650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho _create, cstrToken("nfc\0\3")); 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UNICODE_STRING_SIMPLE("NFD"), TRUE); 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UNICODE_STRING_SIMPLE("NFKD"), TRUE); 5150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"), 5250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UNICODE_STRING_SIMPLE("NFD"), FALSE); 5350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"), 5450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UNICODE_STRING_SIMPLE("FCD"), FALSE); 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Factory methods 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruTransliterator* NormalizationTransliterator::_create(const UnicodeString& ID, 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru Token context) { 6250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const char *name = (const char *)context.pointer; 6350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1]; 6450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode errorCode = U_ZERO_ERROR; 6550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode); 6650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_SUCCESS(errorCode)) { 6750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return new NormalizationTransliterator(ID, *norm2); 6850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 6950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return NULL; 7050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Constructs a transliterator. 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 7650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id, 7750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const Normalizer2 &norm2) : 7850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator(id, 0), fNorm2(norm2) {} 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Destructor. 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruNormalizationTransliterator::~NormalizationTransliterator() { 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Copy constructor. 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruNormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : 9050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Transliterator(o), fNorm2(o.fNorm2) {} 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Transliterator API. 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruTransliterator* NormalizationTransliterator::clone(void) const { 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return new NormalizationTransliterator(*this); 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/** 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Implements {@link Transliterator#handleTransliterate}. 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool isIncremental) const { 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // start and limit of the input range 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t start = offsets.start; 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t limit = offsets.limit; 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if(start >= limit) { 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /* 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Normalize as short chunks at a time as possible even in 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * bulk mode, so that styled text is minimally disrupted. 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * In incremental mode, a chunk that ends with offsets.limit 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * must not be normalized. 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * If it was known that the input text is not styled, then 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * a bulk mode normalization could look like this: 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 12050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString input, normalized; 12150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t length = limit - start; 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru input.releaseBuffer(length); 124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 12650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fNorm2.normalize(input, normalized, status); 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 12850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho text.handleReplaceBetween(start, limit, normalized); 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 13050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t delta = normalized.length() - length; 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offsets.contextLimit += delta; 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offsets.limit += delta; 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offsets.start = limit + delta; 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */ 13650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode errorCode = U_ZERO_ERROR; 13750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString segment; 13850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString normalized; 13950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c = text.char32At(start); 14050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho do { 14150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t prev = start; 14250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Skip at least one character so we make progress. 14350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // c holds the character at start. 14450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho segment.remove(); 14550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho do { 14650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho segment.append(c); 14750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho start += U16_LENGTH(c); 14850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start))); 14950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) { 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // stop in incremental mode when we reach the input limit 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // in case there are additional characters that could change the 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // normalization result 15350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho start=prev; 15450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 15650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fNorm2.normalize(segment, normalized, errorCode); 15750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_FAILURE(errorCode)) { 15850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 15950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 16050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(segment != normalized) { 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // replace the input chunk with its normalized form 16250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho text.handleReplaceBetween(prev, start, normalized); 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // update all necessary indexes accordingly 16550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t delta = normalized.length() - (start - prev); 16650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho start += delta; 16750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho limit += delta; 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 16950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } while(start < limit); 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru offsets.start = start; 17250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho offsets.contextLimit += limit - offsets.limit; 17350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho offsets.limit = limit; 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_END 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 179