1/* 2 ************************************************************************* 3 * COPYRIGHT: 4 * Copyright (c) 1996-2010, International Business Machines Corporation and 5 * others. All Rights Reserved. 6 ************************************************************************* 7 */ 8 9#include "unicode/utypes.h" 10 11#if !UCONFIG_NO_NORMALIZATION 12 13#include "unicode/uniset.h" 14#include "unicode/unistr.h" 15#include "unicode/chariter.h" 16#include "unicode/schriter.h" 17#include "unicode/uchriter.h" 18#include "unicode/normlzr.h" 19#include "cmemory.h" 20#include "normalizer2impl.h" 21#include "uprops.h" // for uniset_getUnicode32Instance() 22 23U_NAMESPACE_BEGIN 24 25UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) 26 27//------------------------------------------------------------------------- 28// Constructors and other boilerplate 29//------------------------------------------------------------------------- 30 31Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : 32 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 33 text(new StringCharacterIterator(str)), 34 currentIndex(0), nextIndex(0), 35 buffer(), bufferPos(0) 36{ 37 init(); 38} 39 40Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : 41 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 42 text(new UCharCharacterIterator(str, length)), 43 currentIndex(0), nextIndex(0), 44 buffer(), bufferPos(0) 45{ 46 init(); 47} 48 49Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : 50 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 51 text(iter.clone()), 52 currentIndex(0), nextIndex(0), 53 buffer(), bufferPos(0) 54{ 55 init(); 56} 57 58Normalizer::Normalizer(const Normalizer ©) : 59 UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), 60 text(copy.text->clone()), 61 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), 62 buffer(copy.buffer), bufferPos(copy.bufferPos) 63{ 64 init(); 65} 66 67static const UChar _NUL=0; 68 69void 70Normalizer::init() { 71 UErrorCode errorCode=U_ZERO_ERROR; 72 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); 73 if(fOptions&UNORM_UNICODE_3_2) { 74 delete fFilteredNorm2; 75 fNorm2=fFilteredNorm2= 76 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); 77 } 78 if(U_FAILURE(errorCode)) { 79 errorCode=U_ZERO_ERROR; 80 fNorm2=Normalizer2Factory::getNoopInstance(errorCode); 81 } 82} 83 84Normalizer::~Normalizer() 85{ 86 delete fFilteredNorm2; 87 delete text; 88} 89 90Normalizer* 91Normalizer::clone() const 92{ 93 return new Normalizer(*this); 94} 95 96/** 97 * Generates a hash code for this iterator. 98 */ 99int32_t Normalizer::hashCode() const 100{ 101 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; 102} 103 104UBool Normalizer::operator==(const Normalizer& that) const 105{ 106 return 107 this==&that || 108 (fUMode==that.fUMode && 109 fOptions==that.fOptions && 110 *text==*that.text && 111 buffer==that.buffer && 112 bufferPos==that.bufferPos && 113 nextIndex==that.nextIndex); 114} 115 116//------------------------------------------------------------------------- 117// Static utility methods 118//------------------------------------------------------------------------- 119 120void U_EXPORT2 121Normalizer::normalize(const UnicodeString& source, 122 UNormalizationMode mode, int32_t options, 123 UnicodeString& result, 124 UErrorCode &status) { 125 if(source.isBogus() || U_FAILURE(status)) { 126 result.setToBogus(); 127 if(U_SUCCESS(status)) { 128 status=U_ILLEGAL_ARGUMENT_ERROR; 129 } 130 } else { 131 UnicodeString localDest; 132 UnicodeString *dest; 133 134 if(&source!=&result) { 135 dest=&result; 136 } else { 137 // the source and result strings are the same object, use a temporary one 138 dest=&localDest; 139 } 140 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 141 if(U_SUCCESS(status)) { 142 if(options&UNORM_UNICODE_3_2) { 143 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 144 normalize(source, *dest, status); 145 } else { 146 n2->normalize(source, *dest, status); 147 } 148 } 149 if(dest==&localDest && U_SUCCESS(status)) { 150 result=*dest; 151 } 152 } 153} 154 155void U_EXPORT2 156Normalizer::compose(const UnicodeString& source, 157 UBool compat, int32_t options, 158 UnicodeString& result, 159 UErrorCode &status) { 160 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); 161} 162 163void U_EXPORT2 164Normalizer::decompose(const UnicodeString& source, 165 UBool compat, int32_t options, 166 UnicodeString& result, 167 UErrorCode &status) { 168 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); 169} 170 171UNormalizationCheckResult 172Normalizer::quickCheck(const UnicodeString& source, 173 UNormalizationMode mode, int32_t options, 174 UErrorCode &status) { 175 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 176 if(U_SUCCESS(status)) { 177 if(options&UNORM_UNICODE_3_2) { 178 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 179 quickCheck(source, status); 180 } else { 181 return n2->quickCheck(source, status); 182 } 183 } else { 184 return UNORM_MAYBE; 185 } 186} 187 188UBool 189Normalizer::isNormalized(const UnicodeString& source, 190 UNormalizationMode mode, int32_t options, 191 UErrorCode &status) { 192 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 193 if(U_SUCCESS(status)) { 194 if(options&UNORM_UNICODE_3_2) { 195 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 196 isNormalized(source, status); 197 } else { 198 return n2->isNormalized(source, status); 199 } 200 } else { 201 return FALSE; 202 } 203} 204 205UnicodeString & U_EXPORT2 206Normalizer::concatenate(UnicodeString &left, UnicodeString &right, 207 UnicodeString &result, 208 UNormalizationMode mode, int32_t options, 209 UErrorCode &errorCode) { 210 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { 211 result.setToBogus(); 212 if(U_SUCCESS(errorCode)) { 213 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 214 } 215 } else { 216 UnicodeString localDest; 217 UnicodeString *dest; 218 219 if(&right!=&result) { 220 dest=&result; 221 } else { 222 // the right and result strings are the same object, use a temporary one 223 dest=&localDest; 224 } 225 *dest=left; 226 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); 227 if(U_SUCCESS(errorCode)) { 228 if(options&UNORM_UNICODE_3_2) { 229 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). 230 append(*dest, right, errorCode); 231 } else { 232 n2->append(*dest, right, errorCode); 233 } 234 } 235 if(dest==&localDest && U_SUCCESS(errorCode)) { 236 result=*dest; 237 } 238 } 239 return result; 240} 241 242//------------------------------------------------------------------------- 243// Iteration API 244//------------------------------------------------------------------------- 245 246/** 247 * Return the current character in the normalized text. 248 */ 249UChar32 Normalizer::current() { 250 if(bufferPos<buffer.length() || nextNormalize()) { 251 return buffer.char32At(bufferPos); 252 } else { 253 return DONE; 254 } 255} 256 257/** 258 * Return the next character in the normalized text and advance 259 * the iteration position by one. If the end 260 * of the text has already been reached, {@link #DONE} is returned. 261 */ 262UChar32 Normalizer::next() { 263 if(bufferPos<buffer.length() || nextNormalize()) { 264 UChar32 c=buffer.char32At(bufferPos); 265 bufferPos+=UTF_CHAR_LENGTH(c); 266 return c; 267 } else { 268 return DONE; 269 } 270} 271 272/** 273 * Return the previous character in the normalized text and decrement 274 * the iteration position by one. If the beginning 275 * of the text has already been reached, {@link #DONE} is returned. 276 */ 277UChar32 Normalizer::previous() { 278 if(bufferPos>0 || previousNormalize()) { 279 UChar32 c=buffer.char32At(bufferPos-1); 280 bufferPos-=UTF_CHAR_LENGTH(c); 281 return c; 282 } else { 283 return DONE; 284 } 285} 286 287void Normalizer::reset() { 288 currentIndex=nextIndex=text->setToStart(); 289 clearBuffer(); 290} 291 292void 293Normalizer::setIndexOnly(int32_t index) { 294 text->setIndex(index); // pins index 295 currentIndex=nextIndex=text->getIndex(); 296 clearBuffer(); 297} 298 299/** 300 * Return the first character in the normalized text. This resets 301 * the <tt>Normalizer's</tt> position to the beginning of the text. 302 */ 303UChar32 Normalizer::first() { 304 reset(); 305 return next(); 306} 307 308/** 309 * Return the last character in the normalized text. This resets 310 * the <tt>Normalizer's</tt> position to be just before the 311 * the input text corresponding to that normalized character. 312 */ 313UChar32 Normalizer::last() { 314 currentIndex=nextIndex=text->setToEnd(); 315 clearBuffer(); 316 return previous(); 317} 318 319/** 320 * Retrieve the current iteration position in the input text that is 321 * being normalized. This method is useful in applications such as 322 * searching, where you need to be able to determine the position in 323 * the input text that corresponds to a given normalized output character. 324 * <p> 325 * <b>Note:</b> This method sets the position in the <em>input</em>, while 326 * {@link #next} and {@link #previous} iterate through characters in the 327 * <em>output</em>. This means that there is not necessarily a one-to-one 328 * correspondence between characters returned by <tt>next</tt> and 329 * <tt>previous</tt> and the indices passed to and returned from 330 * <tt>setIndex</tt> and {@link #getIndex}. 331 * 332 */ 333int32_t Normalizer::getIndex() const { 334 if(bufferPos<buffer.length()) { 335 return currentIndex; 336 } else { 337 return nextIndex; 338 } 339} 340 341/** 342 * Retrieve the index of the start of the input text. This is the begin index 343 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> 344 * over which this <tt>Normalizer</tt> is iterating 345 */ 346int32_t Normalizer::startIndex() const { 347 return text->startIndex(); 348} 349 350/** 351 * Retrieve the index of the end of the input text. This is the end index 352 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 353 * over which this <tt>Normalizer</tt> is iterating 354 */ 355int32_t Normalizer::endIndex() const { 356 return text->endIndex(); 357} 358 359//------------------------------------------------------------------------- 360// Property access methods 361//------------------------------------------------------------------------- 362 363void 364Normalizer::setMode(UNormalizationMode newMode) 365{ 366 fUMode = newMode; 367 init(); 368} 369 370UNormalizationMode 371Normalizer::getUMode() const 372{ 373 return fUMode; 374} 375 376void 377Normalizer::setOption(int32_t option, 378 UBool value) 379{ 380 if (value) { 381 fOptions |= option; 382 } else { 383 fOptions &= (~option); 384 } 385 init(); 386} 387 388UBool 389Normalizer::getOption(int32_t option) const 390{ 391 return (fOptions & option) != 0; 392} 393 394/** 395 * Set the input text over which this <tt>Normalizer</tt> will iterate. 396 * The iteration position is set to the beginning of the input text. 397 */ 398void 399Normalizer::setText(const UnicodeString& newText, 400 UErrorCode &status) 401{ 402 if (U_FAILURE(status)) { 403 return; 404 } 405 CharacterIterator *newIter = new StringCharacterIterator(newText); 406 if (newIter == NULL) { 407 status = U_MEMORY_ALLOCATION_ERROR; 408 return; 409 } 410 delete text; 411 text = newIter; 412 reset(); 413} 414 415/** 416 * Set the input text over which this <tt>Normalizer</tt> will iterate. 417 * The iteration position is set to the beginning of the string. 418 */ 419void 420Normalizer::setText(const CharacterIterator& newText, 421 UErrorCode &status) 422{ 423 if (U_FAILURE(status)) { 424 return; 425 } 426 CharacterIterator *newIter = newText.clone(); 427 if (newIter == NULL) { 428 status = U_MEMORY_ALLOCATION_ERROR; 429 return; 430 } 431 delete text; 432 text = newIter; 433 reset(); 434} 435 436void 437Normalizer::setText(const UChar* newText, 438 int32_t length, 439 UErrorCode &status) 440{ 441 if (U_FAILURE(status)) { 442 return; 443 } 444 CharacterIterator *newIter = new UCharCharacterIterator(newText, length); 445 if (newIter == NULL) { 446 status = U_MEMORY_ALLOCATION_ERROR; 447 return; 448 } 449 delete text; 450 text = newIter; 451 reset(); 452} 453 454/** 455 * Copies the text under iteration into the UnicodeString referred to by "result". 456 * @param result Receives a copy of the text under iteration. 457 */ 458void 459Normalizer::getText(UnicodeString& result) 460{ 461 text->getText(result); 462} 463 464//------------------------------------------------------------------------- 465// Private utility methods 466//------------------------------------------------------------------------- 467 468void Normalizer::clearBuffer() { 469 buffer.remove(); 470 bufferPos=0; 471} 472 473UBool 474Normalizer::nextNormalize() { 475 clearBuffer(); 476 currentIndex=nextIndex; 477 text->setIndex(nextIndex); 478 if(!text->hasNext()) { 479 return FALSE; 480 } 481 // Skip at least one character so we make progress. 482 UnicodeString segment(text->next32PostInc()); 483 while(text->hasNext()) { 484 UChar32 c; 485 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { 486 text->move32(-1, CharacterIterator::kCurrent); 487 break; 488 } 489 segment.append(c); 490 } 491 nextIndex=text->getIndex(); 492 UErrorCode errorCode=U_ZERO_ERROR; 493 fNorm2->normalize(segment, buffer, errorCode); 494 return U_SUCCESS(errorCode) && !buffer.isEmpty(); 495} 496 497UBool 498Normalizer::previousNormalize() { 499 clearBuffer(); 500 nextIndex=currentIndex; 501 text->setIndex(currentIndex); 502 if(!text->hasPrevious()) { 503 return FALSE; 504 } 505 UnicodeString segment; 506 while(text->hasPrevious()) { 507 UChar32 c=text->previous32(); 508 segment.insert(0, c); 509 if(fNorm2->hasBoundaryBefore(c)) { 510 break; 511 } 512 } 513 currentIndex=text->getIndex(); 514 UErrorCode errorCode=U_ZERO_ERROR; 515 fNorm2->normalize(segment, buffer, errorCode); 516 bufferPos=buffer.length(); 517 return U_SUCCESS(errorCode) && !buffer.isEmpty(); 518} 519 520U_NAMESPACE_END 521 522#endif /* #if !UCONFIG_NO_NORMALIZATION */ 523