1/* 2 ************************************************************************* 3 * COPYRIGHT: 4 * Copyright (c) 1996-2005, International Business Machines Corporation and 5 * others. All Rights Reserved. 6 ************************************************************************* 7 */ 8 9#include "unicode/utypes.h" 10 11#if !UCONFIG_NO_NORMALIZATION 12 13#include "unicode/unistr.h" 14#include "unicode/chariter.h" 15#include "unicode/schriter.h" 16#include "unicode/uchriter.h" 17#include "unicode/uiter.h" 18#include "unicode/normlzr.h" 19#include "cmemory.h" 20#include "unormimp.h" 21 22U_NAMESPACE_BEGIN 23 24UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) 25 26//------------------------------------------------------------------------- 27// Constructors and other boilerplate 28//------------------------------------------------------------------------- 29 30Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : 31 UObject(), fUMode(mode), fOptions(0), 32 currentIndex(0), nextIndex(0), 33 buffer(), bufferPos(0) 34{ 35 init(new StringCharacterIterator(str)); 36} 37 38Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : 39 UObject(), fUMode(mode), fOptions(0), 40 currentIndex(0), nextIndex(0), 41 buffer(), bufferPos(0) 42{ 43 init(new UCharCharacterIterator(str, length)); 44} 45 46Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : 47 UObject(), fUMode(mode), fOptions(0), 48 currentIndex(0), nextIndex(0), 49 buffer(), bufferPos(0) 50{ 51 init(iter.clone()); 52} 53 54Normalizer::Normalizer(const Normalizer ©) : 55 UObject(copy), fUMode(copy.fUMode), fOptions(copy.fOptions), 56 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), 57 buffer(copy.buffer), bufferPos(copy.bufferPos) 58{ 59 init(((CharacterIterator *)(copy.text->context))->clone()); 60} 61 62static const UChar _NUL=0; 63 64void 65Normalizer::init(CharacterIterator *iter) { 66 UErrorCode errorCode=U_ZERO_ERROR; 67 68 text=(UCharIterator *)uprv_malloc(sizeof(UCharIterator)); 69 if(text!=NULL) { 70 if(unorm_haveData(&errorCode)) { 71 uiter_setCharacterIterator(text, iter); 72 } else { 73 delete iter; 74 uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0)); 75 } 76 } else { 77 delete iter; 78 } 79} 80 81Normalizer::~Normalizer() 82{ 83 if(text!=NULL) { 84 delete (CharacterIterator *)text->context; 85 uprv_free(text); 86 } 87} 88 89Normalizer* 90Normalizer::clone() const 91{ 92 if(this!=0) { 93 return new Normalizer(*this); 94 } else { 95 return 0; 96 } 97} 98 99/** 100 * Generates a hash code for this iterator. 101 */ 102int32_t Normalizer::hashCode() const 103{ 104 return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; 105} 106 107UBool Normalizer::operator==(const Normalizer& that) const 108{ 109 return 110 this==&that || 111 fUMode==that.fUMode && 112 fOptions==that.fOptions && 113 *((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) && 114 buffer==that.buffer && 115 bufferPos==that.bufferPos && 116 nextIndex==that.nextIndex; 117} 118 119//------------------------------------------------------------------------- 120// Static utility methods 121//------------------------------------------------------------------------- 122 123void U_EXPORT2 124Normalizer::normalize(const UnicodeString& source, 125 UNormalizationMode mode, int32_t options, 126 UnicodeString& result, 127 UErrorCode &status) { 128 if(source.isBogus() || U_FAILURE(status)) { 129 result.setToBogus(); 130 if(U_SUCCESS(status)) { 131 status=U_ILLEGAL_ARGUMENT_ERROR; 132 } 133 } else { 134 UnicodeString localDest; 135 UnicodeString *dest; 136 137 if(&source!=&result) { 138 dest=&result; 139 } else { 140 // the source and result strings are the same object, use a temporary one 141 dest=&localDest; 142 } 143 144 UChar *buffer=dest->getBuffer(source.length()); 145 int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(), 146 source.getBuffer(), source.length(), 147 mode, options, 148 &status); 149 dest->releaseBuffer(U_SUCCESS(status) ? length : 0); 150 if(status==U_BUFFER_OVERFLOW_ERROR) { 151 status=U_ZERO_ERROR; 152 buffer=dest->getBuffer(length); 153 length=unorm_internalNormalize(buffer, dest->getCapacity(), 154 source.getBuffer(), source.length(), 155 mode, options, 156 &status); 157 dest->releaseBuffer(U_SUCCESS(status) ? length : 0); 158 } 159 160 if(dest==&localDest) { 161 result=*dest; 162 } 163 if(U_FAILURE(status)) { 164 result.setToBogus(); 165 } 166 } 167} 168 169void U_EXPORT2 170Normalizer::compose(const UnicodeString& source, 171 UBool compat, int32_t options, 172 UnicodeString& result, 173 UErrorCode &status) { 174 if(source.isBogus() || U_FAILURE(status)) { 175 result.setToBogus(); 176 if(U_SUCCESS(status)) { 177 status=U_ILLEGAL_ARGUMENT_ERROR; 178 } 179 } else { 180 UnicodeString localDest; 181 UnicodeString *dest; 182 183 if(&source!=&result) { 184 dest=&result; 185 } else { 186 // the source and result strings are the same object, use a temporary one 187 dest=&localDest; 188 } 189 190 UChar *buffer=dest->getBuffer(source.length()); 191 int32_t length=unorm_compose(buffer, dest->getCapacity(), 192 source.getBuffer(), source.length(), 193 compat, options, 194 &status); 195 dest->releaseBuffer(U_SUCCESS(status) ? length : 0); 196 if(status==U_BUFFER_OVERFLOW_ERROR) { 197 status=U_ZERO_ERROR; 198 buffer=dest->getBuffer(length); 199 length=unorm_compose(buffer, dest->getCapacity(), 200 source.getBuffer(), source.length(), 201 compat, options, 202 &status); 203 dest->releaseBuffer(U_SUCCESS(status) ? length : 0); 204 } 205 206 if(dest==&localDest) { 207 result=*dest; 208 } 209 if(U_FAILURE(status)) { 210 result.setToBogus(); 211 } 212 } 213} 214 215void U_EXPORT2 216Normalizer::decompose(const UnicodeString& source, 217 UBool compat, int32_t options, 218 UnicodeString& result, 219 UErrorCode &status) { 220 if(source.isBogus() || U_FAILURE(status)) { 221 result.setToBogus(); 222 if(U_SUCCESS(status)) { 223 status=U_ILLEGAL_ARGUMENT_ERROR; 224 } 225 } else { 226 UnicodeString localDest; 227 UnicodeString *dest; 228 229 if(&source!=&result) { 230 dest=&result; 231 } else { 232 // the source and result strings are the same object, use a temporary one 233 dest=&localDest; 234 } 235 236 UChar *buffer=dest->getBuffer(source.length()); 237 int32_t length=unorm_decompose(buffer, dest->getCapacity(), 238 source.getBuffer(), source.length(), 239 compat, options, 240 &status); 241 dest->releaseBuffer(U_SUCCESS(status) ? length : 0); 242 if(status==U_BUFFER_OVERFLOW_ERROR) { 243 status=U_ZERO_ERROR; 244 buffer=dest->getBuffer(length); 245 length=unorm_decompose(buffer, dest->getCapacity(), 246 source.getBuffer(), source.length(), 247 compat, options, 248 &status); 249 dest->releaseBuffer(U_SUCCESS(status) ? length : 0); 250 } 251 252 if(dest==&localDest) { 253 result=*dest; 254 } 255 if(U_FAILURE(status)) { 256 result.setToBogus(); 257 } 258 } 259} 260 261UnicodeString & U_EXPORT2 262Normalizer::concatenate(UnicodeString &left, UnicodeString &right, 263 UnicodeString &result, 264 UNormalizationMode mode, int32_t options, 265 UErrorCode &errorCode) { 266 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { 267 result.setToBogus(); 268 if(U_SUCCESS(errorCode)) { 269 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 270 } 271 } else { 272 UnicodeString localDest; 273 UnicodeString *dest; 274 275 if(&left!=&result && &right!=&result) { 276 dest=&result; 277 } else { 278 // the source and result strings are the same object, use a temporary one 279 dest=&localDest; 280 } 281 282 UChar *buffer=dest->getBuffer(left.length()+right.length()); 283 int32_t length=unorm_concatenate(left.getBuffer(), left.length(), 284 right.getBuffer(), right.length(), 285 buffer, dest->getCapacity(), 286 mode, options, 287 &errorCode); 288 dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0); 289 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 290 errorCode=U_ZERO_ERROR; 291 buffer=dest->getBuffer(length); 292 int32_t length=unorm_concatenate(left.getBuffer(), left.length(), 293 right.getBuffer(), right.length(), 294 buffer, dest->getCapacity(), 295 mode, options, 296 &errorCode); 297 dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0); 298 } 299 300 if(dest==&localDest) { 301 result=*dest; 302 } 303 if(U_FAILURE(errorCode)) { 304 result.setToBogus(); 305 } 306 } 307 return result; 308} 309 310//------------------------------------------------------------------------- 311// Iteration API 312//------------------------------------------------------------------------- 313 314/** 315 * Return the current character in the normalized text. 316 */ 317UChar32 Normalizer::current() { 318 if(bufferPos<buffer.length() || nextNormalize()) { 319 return buffer.char32At(bufferPos); 320 } else { 321 return DONE; 322 } 323} 324 325/** 326 * Return the next character in the normalized text and advance 327 * the iteration position by one. If the end 328 * of the text has already been reached, {@link #DONE} is returned. 329 */ 330UChar32 Normalizer::next() { 331 if(bufferPos<buffer.length() || nextNormalize()) { 332 UChar32 c=buffer.char32At(bufferPos); 333 bufferPos+=UTF_CHAR_LENGTH(c); 334 return c; 335 } else { 336 return DONE; 337 } 338} 339 340/** 341 * Return the previous character in the normalized text and decrement 342 * the iteration position by one. If the beginning 343 * of the text has already been reached, {@link #DONE} is returned. 344 */ 345UChar32 Normalizer::previous() { 346 if(bufferPos>0 || previousNormalize()) { 347 UChar32 c=buffer.char32At(bufferPos-1); 348 bufferPos-=UTF_CHAR_LENGTH(c); 349 return c; 350 } else { 351 return DONE; 352 } 353} 354 355void Normalizer::reset() { 356 currentIndex=nextIndex=text->move(text, 0, UITER_START); 357 clearBuffer(); 358} 359 360void 361Normalizer::setIndexOnly(int32_t index) { 362 currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index 363 clearBuffer(); 364} 365 366/** 367 * Return the first character in the normalized text-> This resets 368 * the <tt>Normalizer's</tt> position to the beginning of the text-> 369 */ 370UChar32 Normalizer::first() { 371 reset(); 372 return next(); 373} 374 375/** 376 * Return the last character in the normalized text-> This resets 377 * the <tt>Normalizer's</tt> position to be just before the 378 * the input text corresponding to that normalized character. 379 */ 380UChar32 Normalizer::last() { 381 currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT); 382 clearBuffer(); 383 return previous(); 384} 385 386/** 387 * Retrieve the current iteration position in the input text that is 388 * being normalized. This method is useful in applications such as 389 * searching, where you need to be able to determine the position in 390 * the input text that corresponds to a given normalized output character. 391 * <p> 392 * <b>Note:</b> This method sets the position in the <em>input</em>, while 393 * {@link #next} and {@link #previous} iterate through characters in the 394 * <em>output</em>. This means that there is not necessarily a one-to-one 395 * correspondence between characters returned by <tt>next</tt> and 396 * <tt>previous</tt> and the indices passed to and returned from 397 * <tt>setIndex</tt> and {@link #getIndex}. 398 * 399 */ 400int32_t Normalizer::getIndex() const { 401 if(bufferPos<buffer.length()) { 402 return currentIndex; 403 } else { 404 return nextIndex; 405 } 406} 407 408/** 409 * Retrieve the index of the start of the input text-> This is the begin index 410 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> 411 * over which this <tt>Normalizer</tt> is iterating 412 */ 413int32_t Normalizer::startIndex() const { 414 return text->getIndex(text, UITER_START); 415} 416 417/** 418 * Retrieve the index of the end of the input text-> This is the end index 419 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 420 * over which this <tt>Normalizer</tt> is iterating 421 */ 422int32_t Normalizer::endIndex() const { 423 return text->getIndex(text, UITER_LIMIT); 424} 425 426//------------------------------------------------------------------------- 427// Property access methods 428//------------------------------------------------------------------------- 429 430void 431Normalizer::setMode(UNormalizationMode newMode) 432{ 433 fUMode = newMode; 434} 435 436UNormalizationMode 437Normalizer::getUMode() const 438{ 439 return fUMode; 440} 441 442void 443Normalizer::setOption(int32_t option, 444 UBool value) 445{ 446 if (value) { 447 fOptions |= option; 448 } else { 449 fOptions &= (~option); 450 } 451} 452 453UBool 454Normalizer::getOption(int32_t option) const 455{ 456 return (fOptions & option) != 0; 457} 458 459/** 460 * Set the input text over which this <tt>Normalizer</tt> will iterate. 461 * The iteration position is set to the beginning of the input text-> 462 */ 463void 464Normalizer::setText(const UnicodeString& newText, 465 UErrorCode &status) 466{ 467 if (U_FAILURE(status)) { 468 return; 469 } 470 CharacterIterator *newIter = new StringCharacterIterator(newText); 471 if (newIter == NULL) { 472 status = U_MEMORY_ALLOCATION_ERROR; 473 return; 474 } 475 delete (CharacterIterator *)(text->context); 476 text->context = newIter; 477 reset(); 478} 479 480/** 481 * Set the input text over which this <tt>Normalizer</tt> will iterate. 482 * The iteration position is set to the beginning of the string. 483 */ 484void 485Normalizer::setText(const CharacterIterator& newText, 486 UErrorCode &status) 487{ 488 if (U_FAILURE(status)) { 489 return; 490 } 491 CharacterIterator *newIter = newText.clone(); 492 if (newIter == NULL) { 493 status = U_MEMORY_ALLOCATION_ERROR; 494 return; 495 } 496 delete (CharacterIterator *)(text->context); 497 text->context = newIter; 498 reset(); 499} 500 501void 502Normalizer::setText(const UChar* newText, 503 int32_t length, 504 UErrorCode &status) 505{ 506 if (U_FAILURE(status)) { 507 return; 508 } 509 CharacterIterator *newIter = new UCharCharacterIterator(newText, length); 510 if (newIter == NULL) { 511 status = U_MEMORY_ALLOCATION_ERROR; 512 return; 513 } 514 delete (CharacterIterator *)(text->context); 515 text->context = newIter; 516 reset(); 517} 518 519/** 520 * Copies the text under iteration into the UnicodeString referred to by "result". 521 * @param result Receives a copy of the text under iteration. 522 */ 523void 524Normalizer::getText(UnicodeString& result) 525{ 526 ((CharacterIterator *)(text->context))->getText(result); 527} 528 529//------------------------------------------------------------------------- 530// Private utility methods 531//------------------------------------------------------------------------- 532 533void Normalizer::clearBuffer() { 534 buffer.remove(); 535 bufferPos=0; 536} 537 538UBool 539Normalizer::nextNormalize() { 540 UChar *p; 541 int32_t length; 542 UErrorCode errorCode; 543 544 clearBuffer(); 545 currentIndex=nextIndex; 546 text->move(text, nextIndex, UITER_ZERO); 547 if(!text->hasNext(text)) { 548 return FALSE; 549 } 550 551 errorCode=U_ZERO_ERROR; 552 p=buffer.getBuffer(-1); 553 length=unorm_next(text, p, buffer.getCapacity(), 554 fUMode, fOptions, 555 TRUE, 0, 556 &errorCode); 557 buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); 558 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 559 errorCode=U_ZERO_ERROR; 560 text->move(text, nextIndex, UITER_ZERO); 561 p=buffer.getBuffer(length); 562 length=unorm_next(text, p, buffer.getCapacity(), 563 fUMode, fOptions, 564 TRUE, 0, 565 &errorCode); 566 buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); 567 } 568 569 nextIndex=text->getIndex(text, UITER_CURRENT); 570 return U_SUCCESS(errorCode) && !buffer.isEmpty(); 571} 572 573UBool 574Normalizer::previousNormalize() { 575 UChar *p; 576 int32_t length; 577 UErrorCode errorCode; 578 579 clearBuffer(); 580 nextIndex=currentIndex; 581 text->move(text, currentIndex, UITER_ZERO); 582 if(!text->hasPrevious(text)) { 583 return FALSE; 584 } 585 586 errorCode=U_ZERO_ERROR; 587 p=buffer.getBuffer(-1); 588 length=unorm_previous(text, p, buffer.getCapacity(), 589 fUMode, fOptions, 590 TRUE, 0, 591 &errorCode); 592 buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); 593 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 594 errorCode=U_ZERO_ERROR; 595 text->move(text, currentIndex, UITER_ZERO); 596 p=buffer.getBuffer(length); 597 length=unorm_previous(text, p, buffer.getCapacity(), 598 fUMode, fOptions, 599 TRUE, 0, 600 &errorCode); 601 buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); 602 } 603 604 bufferPos=buffer.length(); 605 currentIndex=text->getIndex(text, UITER_CURRENT); 606 return U_SUCCESS(errorCode) && !buffer.isEmpty(); 607} 608 609U_NAMESPACE_END 610 611#endif /* #if !UCONFIG_NO_NORMALIZATION */ 612