1/* 2********************************************************************** 3* Copyright (C) 2001-2007, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6* Date Name Description 7* 07/03/01 aliu Creation. 8********************************************************************** 9*/ 10 11#include "unicode/utypes.h" 12 13#if !UCONFIG_NO_TRANSLITERATION 14 15#include "unicode/uniset.h" 16#include "unicode/uiter.h" 17#include "nortrans.h" 18#include "unormimp.h" 19#include "ucln_in.h" 20 21U_NAMESPACE_BEGIN 22 23UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) 24 25/** 26 * System registration hook. 27 */ 28void NormalizationTransliterator::registerIDs() { 29 UErrorCode errorCode = U_ZERO_ERROR; 30 if(!unorm_haveData(&errorCode)) { 31 return; 32 } 33 34 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), 35 _create, integerToken(UNORM_NFC)); 36 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), 37 _create, integerToken(UNORM_NFKC)); 38 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), 39 _create, integerToken(UNORM_NFD)); 40 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), 41 _create, integerToken(UNORM_NFKD)); 42 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), 43 UNICODE_STRING_SIMPLE("NFD"), TRUE); 44 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), 45 UNICODE_STRING_SIMPLE("NFKD"), TRUE); 46} 47 48/** 49 * Factory methods 50 */ 51Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID, 52 Token context) { 53 return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0); 54} 55 56/** 57 * Constructs a transliterator. 58 */ 59NormalizationTransliterator::NormalizationTransliterator( 60 const UnicodeString& id, 61 UNormalizationMode mode, int32_t opt) : 62 Transliterator(id, 0) { 63 fMode = mode; 64 options = opt; 65} 66 67/** 68 * Destructor. 69 */ 70NormalizationTransliterator::~NormalizationTransliterator() { 71} 72 73/** 74 * Copy constructor. 75 */ 76NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : 77Transliterator(o) { 78 fMode = o.fMode; 79 options = o.options; 80} 81 82/** 83 * Assignment operator. 84 */ 85/*NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) { 86 Transliterator::operator=(o); 87 fMode = o.fMode; 88 options = o.options; 89 return *this; 90}*/ 91 92/** 93 * Transliterator API. 94 */ 95Transliterator* NormalizationTransliterator::clone(void) const { 96 return new NormalizationTransliterator(*this); 97} 98 99/** 100 * Implements {@link Transliterator#handleTransliterate}. 101 */ 102void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 103 UBool isIncremental) const { 104 // start and limit of the input range 105 int32_t start = offsets.start; 106 int32_t limit = offsets.limit; 107 int32_t length, delta; 108 109 if(start >= limit) { 110 return; 111 } 112 113 // a C code unit iterator, implemented around the Replaceable 114 UCharIterator iter; 115 uiter_setReplaceable(&iter, &text); 116 117 // the output string and buffer pointer 118 UnicodeString output; 119 UChar *buffer; 120 UBool neededToNormalize; 121 122 UErrorCode errorCode; 123 124 /* 125 * Normalize as short chunks at a time as possible even in 126 * bulk mode, so that styled text is minimally disrupted. 127 * In incremental mode, a chunk that ends with offsets.limit 128 * must not be normalized. 129 * 130 * If it was known that the input text is not styled, then 131 * a bulk mode normalization could look like this: 132 * 133 134 UChar staticChars[256]; 135 UnicodeString input; 136 137 length = limit - start; 138 input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias 139 140 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); 141 input.releaseBuffer(length); 142 143 UErrorCode status = U_ZERO_ERROR; 144 Normalizer::normalize(input, fMode, options, output, status); 145 146 text.handleReplaceBetween(start, limit, output); 147 148 int32_t delta = output.length() - length; 149 offsets.contextLimit += delta; 150 offsets.limit += delta; 151 offsets.start = limit + delta; 152 153 * 154 */ 155 while(start < limit) { 156 // set the iterator limits for the remaining input range 157 // this is a moving target because of the replacements in the text object 158 iter.start = iter.index = start; 159 iter.limit = limit; 160 161 // incrementally normalize a small chunk of the input 162 buffer = output.getBuffer(-1); 163 errorCode = U_ZERO_ERROR; 164 length = unorm_next(&iter, buffer, output.getCapacity(), 165 fMode, 0, 166 TRUE, &neededToNormalize, 167 &errorCode); 168 output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); 169 170 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 171 // use a larger output string buffer and do it again from the start 172 iter.index = start; 173 buffer = output.getBuffer(length); 174 errorCode = U_ZERO_ERROR; 175 length = unorm_next(&iter, buffer, output.getCapacity(), 176 fMode, 0, 177 TRUE, &neededToNormalize, 178 &errorCode); 179 output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); 180 } 181 182 if(U_FAILURE(errorCode)) { 183 break; 184 } 185 186 limit = iter.index; 187 if(isIncremental && limit == iter.limit) { 188 // stop in incremental mode when we reach the input limit 189 // in case there are additional characters that could change the 190 // normalization result 191 192 // UNLESS all characters in the result of the normalization of 193 // the last run are in the skippable set 194 const UChar *s=output.getBuffer(); 195 int32_t i=0, outLength=output.length(); 196 UChar32 c; 197 198 while(i<outLength) { 199 U16_NEXT(s, i, outLength, c); 200 if(!unorm_isNFSkippable(c, fMode)) { 201 outLength=-1; // I wish C++ had labeled loops and break outer; ... 202 break; 203 } 204 } 205 if (outLength<0) { 206 break; 207 } 208 } 209 210 if(neededToNormalize) { 211 // replace the input chunk with its normalized form 212 text.handleReplaceBetween(start, limit, output); 213 214 // update all necessary indexes accordingly 215 delta = length - (limit - start); // length change in the text object 216 start = limit += delta; // the next chunk starts where this one ends, with adjustment 217 limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range 218 offsets.contextLimit += delta; 219 } else { 220 // delta == 0 221 start = limit; 222 limit = offsets.limit; 223 } 224 } 225 226 offsets.start = start; 227} 228 229U_NAMESPACE_END 230 231#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 232