1/* 2 ******************************************************************** 3 * COPYRIGHT: 4 * Copyright (c) 1996-2006, International Business Machines Corporation and 5 * others. All Rights Reserved. 6 ******************************************************************** 7 */ 8 9#ifndef NORMLZR_H 10#define NORMLZR_H 11 12#include "unicode/utypes.h" 13 14/** 15 * \file 16 * \brief C++ API: Unicode Normalization 17 */ 18 19#if !UCONFIG_NO_NORMALIZATION 20 21#include "unicode/uobject.h" 22#include "unicode/unistr.h" 23#include "unicode/chariter.h" 24#include "unicode/unorm.h" 25 26 27struct UCharIterator; 28typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */ 29 30U_NAMESPACE_BEGIN 31/** 32 * The Normalizer class supports the standard normalization forms described in 33 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> 34 * Unicode Standard Annex #15: Unicode Normalization Forms</a>. 35 * 36 * The Normalizer class consists of two parts: 37 * - static functions that normalize strings or test if strings are normalized 38 * - a Normalizer object is an iterator that takes any kind of text and 39 * provides iteration over its normalized form 40 * 41 * The Normalizer class is not suitable for subclassing. 42 * 43 * The static functions are basically wrappers around the C implementation, 44 * using UnicodeString instead of UChar*. 45 * For basic information about normalization forms and details about the C API 46 * please see the documentation in unorm.h. 47 * 48 * The iterator API with the Normalizer constructors and the non-static functions 49 * uses a CharacterIterator as input. It is possible to pass a string which 50 * is then internally wrapped in a CharacterIterator. 51 * The input text is not normalized all at once, but incrementally where needed 52 * (providing efficient random access). 
 * This makes it possible to pass in a large text but spend only a small
 * amount of time normalizing a small part of that text.
 * However, if the entire text is normalized, then the iterator will be
 * slower than normalizing the entire text at once and iterating over the result.
 * Another possible use of the Normalizer iterator is to report an index into the
 * original text that is close to where the normalized characters come from.
 *
 * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
 * The earlier implementation reported the getIndex() inconsistently,
 * and previous() could not be used after setIndex(), next(), first(), and current().
 *
 * Normalizer allows you to start normalizing from anywhere in the input text by
 * calling setIndexOnly(), first(), or last().
 * Without calling any of these, the iterator will start at the beginning of the text.
 *
 * At any time, next() returns the next normalized code point (UChar32),
 * with post-increment semantics (like CharacterIterator::next32PostInc()).
 * previous() returns the previous normalized code point (UChar32),
 * with pre-decrement semantics (like CharacterIterator::previous32()).
 *
 * current() returns the current code point
 * (respectively the one at the newly set index) without moving
 * the getIndex(). Note that if the text at the current position
 * needs to be normalized, then these functions will do that.
 * (This is why current() is not const.)
 * It is more efficient to call setIndexOnly() instead, which does not
 * normalize.
 *
 * getIndex() always refers to the position in the input text where the normalized
 * code points are returned from. It does not always change with each returned
 * code point.
 * The code point that is returned from any of the functions
 * corresponds to text at or after getIndex(), according to the
 * function's iteration semantics (post-increment or pre-decrement).
 *
 * next() returns a code point from at or after the getIndex()
 * from before the next() call. After the next() call, the getIndex()
 * might have moved to where the next code point will be returned from
 * (from a next() or current() call).
 * This is semantically equivalent to array access with array[index++]
 * (post-increment semantics).
 *
 * previous() returns a code point from at or after the getIndex()
 * from after the previous() call.
 * This is semantically equivalent to array access with array[--index]
 * (pre-decrement semantics).
 *
 * Internally, the Normalizer iterator normalizes a small piece of text
 * starting at the getIndex() and ending at a following "safe" index.
 * The normalized result is stored in an internal string buffer, and
 * the code points are iterated from there.
 * With multiple iteration calls, this is repeated until the next piece
 * of text needs to be normalized, and the getIndex() needs to be moved.
 *
 * The following "safe" index, the internal buffer, and the secondary
 * iteration index into that buffer are not exposed on the API.
 * This also means that it is currently not practical to return to
 * a particular, arbitrary position in the text because one would need to
 * know, and be able to set, in addition to the getIndex(), at least also the
 * current index into the internal buffer.
 * It is currently only possible to observe when getIndex() changes
 * (with careful consideration of the iteration semantics),
 * at which time the internal index will be 0.
 * For example, if getIndex() is different after next() than before it,
 * then the internal index is 0 and one can return to this getIndex()
 * later with setIndexOnly().
119 * 120 * @author Laura Werner, Mark Davis, Markus Scherer 121 * @stable ICU 2.0 122 */ 123class U_COMMON_API Normalizer : public UObject { 124public: 125 /** 126 * If DONE is returned from an iteration function that returns a code point, 127 * then there are no more normalization results available. 128 * @stable ICU 2.0 129 */ 130 enum { 131 DONE=0xffff 132 }; 133 134 // Constructors 135 136 /** 137 * Creates a new <code>Normalizer</code> object for iterating over the 138 * normalized form of a given string. 139 * <p> 140 * @param str The string to be normalized. The normalization 141 * will start at the beginning of the string. 142 * 143 * @param mode The normalization mode. 144 * @stable ICU 2.0 145 */ 146 Normalizer(const UnicodeString& str, UNormalizationMode mode); 147 148 /** 149 * Creates a new <code>Normalizer</code> object for iterating over the 150 * normalized form of a given string. 151 * <p> 152 * @param str The string to be normalized. The normalization 153 * will start at the beginning of the string. 154 * 155 * @param length Length of the string, or -1 if NUL-terminated. 156 * @param mode The normalization mode. 157 * @stable ICU 2.0 158 */ 159 Normalizer(const UChar* str, int32_t length, UNormalizationMode mode); 160 161 /** 162 * Creates a new <code>Normalizer</code> object for iterating over the 163 * normalized form of the given text. 164 * <p> 165 * @param iter The input text to be normalized. The normalization 166 * will start at the beginning of the string. 167 * 168 * @param mode The normalization mode. 169 * @stable ICU 2.0 170 */ 171 Normalizer(const CharacterIterator& iter, UNormalizationMode mode); 172 173 /** 174 * Copy constructor. 175 * @param copy The object to be copied. 
176 * @stable ICU 2.0 177 */ 178 Normalizer(const Normalizer& copy); 179 180 /** 181 * Destructor 182 * @stable ICU 2.0 183 */ 184 virtual ~Normalizer(); 185 186 187 //------------------------------------------------------------------------- 188 // Static utility methods 189 //------------------------------------------------------------------------- 190 191 /** 192 * Normalizes a <code>UnicodeString</code> according to the specified normalization mode. 193 * This is a wrapper for unorm_normalize(), using UnicodeString's. 194 * 195 * The <code>options</code> parameter specifies which optional 196 * <code>Normalizer</code> features are to be enabled for this operation. 197 * 198 * @param source the input string to be normalized. 199 * @param mode the normalization mode 200 * @param options the optional features to be enabled (0 for no options) 201 * @param result The normalized string (on output). 202 * @param status The error code. 203 * @stable ICU 2.0 204 */ 205 static void U_EXPORT2 normalize(const UnicodeString& source, 206 UNormalizationMode mode, int32_t options, 207 UnicodeString& result, 208 UErrorCode &status); 209 210 /** 211 * Compose a <code>UnicodeString</code>. 212 * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC. 213 * This is a wrapper for unorm_normalize(), using UnicodeString's. 214 * 215 * The <code>options</code> parameter specifies which optional 216 * <code>Normalizer</code> features are to be enabled for this operation. 217 * 218 * @param source the string to be composed. 219 * @param compat Perform compatibility decomposition before composition. 220 * If this argument is <code>FALSE</code>, only canonical 221 * decomposition will be performed. 222 * @param options the optional features to be enabled (0 for no options) 223 * @param result The composed string (on output). 224 * @param status The error code. 
225 * @stable ICU 2.0 226 */ 227 static void U_EXPORT2 compose(const UnicodeString& source, 228 UBool compat, int32_t options, 229 UnicodeString& result, 230 UErrorCode &status); 231 232 /** 233 * Static method to decompose a <code>UnicodeString</code>. 234 * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD. 235 * This is a wrapper for unorm_normalize(), using UnicodeString's. 236 * 237 * The <code>options</code> parameter specifies which optional 238 * <code>Normalizer</code> features are to be enabled for this operation. 239 * 240 * @param source the string to be decomposed. 241 * @param compat Perform compatibility decomposition. 242 * If this argument is <code>FALSE</code>, only canonical 243 * decomposition will be performed. 244 * @param options the optional features to be enabled (0 for no options) 245 * @param result The decomposed string (on output). 246 * @param status The error code. 247 * @stable ICU 2.0 248 */ 249 static void U_EXPORT2 decompose(const UnicodeString& source, 250 UBool compat, int32_t options, 251 UnicodeString& result, 252 UErrorCode &status); 253 254 /** 255 * Performing quick check on a string, to quickly determine if the string is 256 * in a particular normalization format. 257 * This is a wrapper for unorm_quickCheck(), using a UnicodeString. 258 * 259 * Three types of result can be returned UNORM_YES, UNORM_NO or 260 * UNORM_MAYBE. Result UNORM_YES indicates that the argument 261 * string is in the desired normalized format, UNORM_NO determines that 262 * argument string is not in the desired normalized format. A 263 * UNORM_MAYBE result indicates that a more thorough check is required, 264 * the user may have to put the string in its normalized form and compare the 265 * results. 
266 * @param source string for determining if it is in a normalized format 267 * @param mode normalization format 268 * @param status A reference to a UErrorCode to receive any errors 269 * @return UNORM_YES, UNORM_NO or UNORM_MAYBE 270 * 271 * @see isNormalized 272 * @stable ICU 2.0 273 */ 274 static inline UNormalizationCheckResult 275 quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status); 276 277 /** 278 * Performing quick check on a string; same as the other version of quickCheck 279 * but takes an extra options parameter like most normalization functions. 280 * 281 * @param source string for determining if it is in a normalized format 282 * @param mode normalization format 283 * @param options the optional features to be enabled (0 for no options) 284 * @param status A reference to a UErrorCode to receive any errors 285 * @return UNORM_YES, UNORM_NO or UNORM_MAYBE 286 * 287 * @see isNormalized 288 * @stable ICU 2.6 289 */ 290 static inline UNormalizationCheckResult 291 quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status); 292 293 /** 294 * Test if a string is in a given normalization form. 295 * This is semantically equivalent to source.equals(normalize(source, mode)) . 296 * 297 * Unlike unorm_quickCheck(), this function returns a definitive result, 298 * never a "maybe". 299 * For NFD, NFKD, and FCD, both functions work exactly the same. 300 * For NFC and NFKC where quickCheck may return "maybe", this function will 301 * perform further tests to arrive at a TRUE/FALSE result. 302 * 303 * @param src String that is to be tested if it is in a normalization format. 304 * @param mode Which normalization form to test for. 305 * @param errorCode ICU error code in/out parameter. 306 * Must fulfill U_SUCCESS before the function call. 307 * @return Boolean value indicating whether the source string is in the 308 * "mode" normalization form. 
309 * 310 * @see quickCheck 311 * @stable ICU 2.2 312 */ 313 static inline UBool 314 isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode); 315 316 /** 317 * Test if a string is in a given normalization form; same as the other version of isNormalized 318 * but takes an extra options parameter like most normalization functions. 319 * 320 * @param src String that is to be tested if it is in a normalization format. 321 * @param mode Which normalization form to test for. 322 * @param options the optional features to be enabled (0 for no options) 323 * @param errorCode ICU error code in/out parameter. 324 * Must fulfill U_SUCCESS before the function call. 325 * @return Boolean value indicating whether the source string is in the 326 * "mode" normalization form. 327 * 328 * @see quickCheck 329 * @stable ICU 2.6 330 */ 331 static inline UBool 332 isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode); 333 334 /** 335 * Concatenate normalized strings, making sure that the result is normalized as well. 336 * 337 * If both the left and the right strings are in 338 * the normalization form according to "mode/options", 339 * then the result will be 340 * 341 * \code 342 * dest=normalize(left+right, mode, options) 343 * \endcode 344 * 345 * For details see unorm_concatenate in unorm.h. 346 * 347 * @param left Left source string. 348 * @param right Right source string. 349 * @param result The output string. 350 * @param mode The normalization mode. 351 * @param options A bit set of normalization options. 352 * @param errorCode ICU error code in/out parameter. 353 * Must fulfill U_SUCCESS before the function call. 
354 * @return result 355 * 356 * @see unorm_concatenate 357 * @see normalize 358 * @see unorm_next 359 * @see unorm_previous 360 * 361 * @stable ICU 2.1 362 */ 363 static UnicodeString & 364 U_EXPORT2 concatenate(UnicodeString &left, UnicodeString &right, 365 UnicodeString &result, 366 UNormalizationMode mode, int32_t options, 367 UErrorCode &errorCode); 368 369 /** 370 * Compare two strings for canonical equivalence. 371 * Further options include case-insensitive comparison and 372 * code point order (as opposed to code unit order). 373 * 374 * Canonical equivalence between two strings is defined as their normalized 375 * forms (NFD or NFC) being identical. 376 * This function compares strings incrementally instead of normalizing 377 * (and optionally case-folding) both strings entirely, 378 * improving performance significantly. 379 * 380 * Bulk normalization is only necessary if the strings do not fulfill the FCD 381 * conditions. Only in this case, and only if the strings are relatively long, 382 * is memory allocated temporarily. 383 * For FCD strings and short non-FCD strings there is no memory allocation. 384 * 385 * Semantically, this is equivalent to 386 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) 387 * where code point order and foldCase are all optional. 388 * 389 * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match 390 * the case folding must be performed first, then the normalization. 391 * 392 * @param s1 First source string. 393 * @param s2 Second source string. 394 * 395 * @param options A bit set of options: 396 * - U_FOLD_CASE_DEFAULT or 0 is used for default options: 397 * Case-sensitive comparison in code unit order, and the input strings 398 * are quick-checked for FCD. 399 * 400 * - UNORM_INPUT_IS_FCD 401 * Set if the caller knows that both s1 and s2 fulfill the FCD conditions. 402 * If not set, the function will quickCheck for FCD 403 * and normalize if necessary. 
404 * 405 * - U_COMPARE_CODE_POINT_ORDER 406 * Set to choose code point order instead of code unit order 407 * (see u_strCompare for details). 408 * 409 * - U_COMPARE_IGNORE_CASE 410 * Set to compare strings case-insensitively using case folding, 411 * instead of case-sensitively. 412 * If set, then the following case folding options are used. 413 * 414 * - Options as used with case-insensitive comparisons, currently: 415 * 416 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I 417 * (see u_strCaseCompare for details) 418 * 419 * - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT 420 * 421 * @param errorCode ICU error code in/out parameter. 422 * Must fulfill U_SUCCESS before the function call. 423 * @return <0 or 0 or >0 as usual for string comparisons 424 * 425 * @see unorm_compare 426 * @see normalize 427 * @see UNORM_FCD 428 * @see u_strCompare 429 * @see u_strCaseCompare 430 * 431 * @stable ICU 2.2 432 */ 433 static inline int32_t 434 compare(const UnicodeString &s1, const UnicodeString &s2, 435 uint32_t options, 436 UErrorCode &errorCode); 437 438 //------------------------------------------------------------------------- 439 // Iteration API 440 //------------------------------------------------------------------------- 441 442 /** 443 * Return the current character in the normalized text. 444 * current() may need to normalize some text at getIndex(). 445 * The getIndex() is not changed. 446 * 447 * @return the current normalized code point 448 * @stable ICU 2.0 449 */ 450 UChar32 current(void); 451 452 /** 453 * Return the first character in the normalized text. 454 * This is equivalent to setIndexOnly(startIndex()) followed by next(). 455 * (Post-increment semantics.) 456 * 457 * @return the first normalized code point 458 * @stable ICU 2.0 459 */ 460 UChar32 first(void); 461 462 /** 463 * Return the last character in the normalized text. 464 * This is equivalent to setIndexOnly(endIndex()) followed by previous(). 
465 * (Pre-decrement semantics.) 466 * 467 * @return the last normalized code point 468 * @stable ICU 2.0 469 */ 470 UChar32 last(void); 471 472 /** 473 * Return the next character in the normalized text. 474 * (Post-increment semantics.) 475 * If the end of the text has already been reached, DONE is returned. 476 * The DONE value could be confused with a U+FFFF non-character code point 477 * in the text. If this is possible, you can test getIndex()<endIndex() 478 * before calling next(), or (getIndex()<endIndex() || last()!=DONE) 479 * after calling next(). (Calling last() will change the iterator state!) 480 * 481 * The C API unorm_next() is more efficient and does not have this ambiguity. 482 * 483 * @return the next normalized code point 484 * @stable ICU 2.0 485 */ 486 UChar32 next(void); 487 488 /** 489 * Return the previous character in the normalized text and decrement. 490 * (Pre-decrement semantics.) 491 * If the beginning of the text has already been reached, DONE is returned. 492 * The DONE value could be confused with a U+FFFF non-character code point 493 * in the text. If this is possible, you can test 494 * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change 495 * the iterator state!) 496 * 497 * The C API unorm_previous() is more efficient and does not have this ambiguity. 498 * 499 * @return the previous normalized code point 500 * @stable ICU 2.0 501 */ 502 UChar32 previous(void); 503 504 /** 505 * Set the iteration position in the input text that is being normalized, 506 * without any immediate normalization. 507 * After setIndexOnly(), getIndex() will return the same index that is 508 * specified here. 509 * 510 * @param index the desired index in the input text. 511 * @stable ICU 2.0 512 */ 513 void setIndexOnly(int32_t index); 514 515 /** 516 * Reset the index to the beginning of the text. 517 * This is equivalent to setIndexOnly(startIndex)). 
518 * @stable ICU 2.0 519 */ 520 void reset(void); 521 522 /** 523 * Retrieve the current iteration position in the input text that is 524 * being normalized. 525 * 526 * A following call to next() will return a normalized code point from 527 * the input text at or after this index. 528 * 529 * After a call to previous(), getIndex() will point at or before the 530 * position in the input text where the normalized code point 531 * was returned from with previous(). 532 * 533 * @return the current index in the input text 534 * @stable ICU 2.0 535 */ 536 int32_t getIndex(void) const; 537 538 /** 539 * Retrieve the index of the start of the input text. This is the begin index 540 * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string 541 * over which this <code>Normalizer</code> is iterating. 542 * 543 * @return the smallest index in the input text where the Normalizer operates 544 * @stable ICU 2.0 545 */ 546 int32_t startIndex(void) const; 547 548 /** 549 * Retrieve the index of the end of the input text. This is the end index 550 * of the <code>CharacterIterator</code> or the length of the string 551 * over which this <code>Normalizer</code> is iterating. 552 * This end index is exclusive, i.e., the Normalizer operates only on characters 553 * before this index. 554 * 555 * @return the first index in the input text where the Normalizer does not operate 556 * @stable ICU 2.0 557 */ 558 int32_t endIndex(void) const; 559 560 /** 561 * Returns TRUE when both iterators refer to the same character in the same 562 * input text. 563 * 564 * @param that a Normalizer object to compare this one to 565 * @return comparison result 566 * @stable ICU 2.0 567 */ 568 UBool operator==(const Normalizer& that) const; 569 570 /** 571 * Returns FALSE when both iterators refer to the same character in the same 572 * input text. 
573 * 574 * @param that a Normalizer object to compare this one to 575 * @return comparison result 576 * @stable ICU 2.0 577 */ 578 inline UBool operator!=(const Normalizer& that) const; 579 580 /** 581 * Returns a pointer to a new Normalizer that is a clone of this one. 582 * The caller is responsible for deleting the new clone. 583 * @return a pointer to a new Normalizer 584 * @stable ICU 2.0 585 */ 586 Normalizer* clone(void) const; 587 588 /** 589 * Generates a hash code for this iterator. 590 * 591 * @return the hash code 592 * @stable ICU 2.0 593 */ 594 int32_t hashCode(void) const; 595 596 //------------------------------------------------------------------------- 597 // Property access methods 598 //------------------------------------------------------------------------- 599 600 /** 601 * Set the normalization mode for this object. 602 * <p> 603 * <b>Note:</b>If the normalization mode is changed while iterating 604 * over a string, calls to {@link #next() } and {@link #previous() } may 605 * return previously buffers characters in the old normalization mode 606 * until the iteration is able to re-sync at the next base character. 607 * It is safest to call {@link #setIndexOnly }, {@link #reset() }, 608 * {@link #setText }, {@link #first() }, 609 * {@link #last() }, etc. after calling <code>setMode</code>. 610 * <p> 611 * @param newMode the new mode for this <code>Normalizer</code>. 612 * @see #getUMode 613 * @stable ICU 2.0 614 */ 615 void setMode(UNormalizationMode newMode); 616 617 /** 618 * Return the normalization mode for this object. 619 * 620 * This is an unusual name because there used to be a getMode() that 621 * returned a different type. 622 * 623 * @return the mode for this <code>Normalizer</code> 624 * @see #setMode 625 * @stable ICU 2.0 626 */ 627 UNormalizationMode getUMode(void) const; 628 629 /** 630 * Set options that affect this <code>Normalizer</code>'s operation. 
631 * Options do not change the basic composition or decomposition operation 632 * that is being performed, but they control whether 633 * certain optional portions of the operation are done. 634 * Currently the only available option is obsolete. 635 * 636 * It is possible to specify multiple options that are all turned on or off. 637 * 638 * @param option the option(s) whose value is/are to be set. 639 * @param value the new setting for the option. Use <code>TRUE</code> to 640 * turn the option(s) on and <code>FALSE</code> to turn it/them off. 641 * 642 * @see #getOption 643 * @stable ICU 2.0 644 */ 645 void setOption(int32_t option, 646 UBool value); 647 648 /** 649 * Determine whether an option is turned on or off. 650 * If multiple options are specified, then the result is TRUE if any 651 * of them are set. 652 * <p> 653 * @param option the option(s) that are to be checked 654 * @return TRUE if any of the option(s) are set 655 * @see #setOption 656 * @stable ICU 2.0 657 */ 658 UBool getOption(int32_t option) const; 659 660 /** 661 * Set the input text over which this <code>Normalizer</code> will iterate. 662 * The iteration position is set to the beginning. 663 * 664 * @param newText a string that replaces the current input text 665 * @param status a UErrorCode 666 * @stable ICU 2.0 667 */ 668 void setText(const UnicodeString& newText, 669 UErrorCode &status); 670 671 /** 672 * Set the input text over which this <code>Normalizer</code> will iterate. 673 * The iteration position is set to the beginning. 674 * 675 * @param newText a CharacterIterator object that replaces the current input text 676 * @param status a UErrorCode 677 * @stable ICU 2.0 678 */ 679 void setText(const CharacterIterator& newText, 680 UErrorCode &status); 681 682 /** 683 * Set the input text over which this <code>Normalizer</code> will iterate. 684 * The iteration position is set to the beginning. 
685 * 686 * @param newText a string that replaces the current input text 687 * @param length the length of the string, or -1 if NUL-terminated 688 * @param status a UErrorCode 689 * @stable ICU 2.0 690 */ 691 void setText(const UChar* newText, 692 int32_t length, 693 UErrorCode &status); 694 /** 695 * Copies the input text into the UnicodeString argument. 696 * 697 * @param result Receives a copy of the text under iteration. 698 * @stable ICU 2.0 699 */ 700 void getText(UnicodeString& result); 701 702 /** 703 * ICU "poor man's RTTI", returns a UClassID for this class. 704 * @returns a UClassID for this class. 705 * @stable ICU 2.2 706 */ 707 static UClassID U_EXPORT2 getStaticClassID(); 708 709 /** 710 * ICU "poor man's RTTI", returns a UClassID for the actual class. 711 * @return a UClassID for the actual class. 712 * @stable ICU 2.2 713 */ 714 virtual UClassID getDynamicClassID() const; 715 716private: 717 //------------------------------------------------------------------------- 718 // Private functions 719 //------------------------------------------------------------------------- 720 721 Normalizer(); // default constructor not implemented 722 Normalizer &operator=(const Normalizer &that); // assignment operator not implemented 723 724 // Private utility methods for iteration 725 // For documentation, see the source code 726 UBool nextNormalize(); 727 UBool previousNormalize(); 728 729 void init(CharacterIterator *iter); 730 void clearBuffer(void); 731 732 //------------------------------------------------------------------------- 733 // Private data 734 //------------------------------------------------------------------------- 735 736 UNormalizationMode fUMode; 737 int32_t fOptions; 738 739 // The input text and our position in it 740 UCharIterator *text; 741 742 // The normalization buffer is the result of normalization 743 // of the source in [currentIndex..nextIndex[ . 
744 int32_t currentIndex, nextIndex; 745 746 // A buffer for holding intermediate results 747 UnicodeString buffer; 748 int32_t bufferPos; 749 750}; 751 752//------------------------------------------------------------------------- 753// Inline implementations 754//------------------------------------------------------------------------- 755 756inline UBool 757Normalizer::operator!= (const Normalizer& other) const 758{ return ! operator==(other); } 759 760inline UNormalizationCheckResult 761Normalizer::quickCheck(const UnicodeString& source, 762 UNormalizationMode mode, 763 UErrorCode &status) { 764 if(U_FAILURE(status)) { 765 return UNORM_MAYBE; 766 } 767 768 return unorm_quickCheck(source.getBuffer(), source.length(), 769 mode, &status); 770} 771 772inline UNormalizationCheckResult 773Normalizer::quickCheck(const UnicodeString& source, 774 UNormalizationMode mode, int32_t options, 775 UErrorCode &status) { 776 if(U_FAILURE(status)) { 777 return UNORM_MAYBE; 778 } 779 780 return unorm_quickCheckWithOptions(source.getBuffer(), source.length(), 781 mode, options, &status); 782} 783 784inline UBool 785Normalizer::isNormalized(const UnicodeString& source, 786 UNormalizationMode mode, 787 UErrorCode &status) { 788 if(U_FAILURE(status)) { 789 return FALSE; 790 } 791 792 return unorm_isNormalized(source.getBuffer(), source.length(), 793 mode, &status); 794} 795 796inline UBool 797Normalizer::isNormalized(const UnicodeString& source, 798 UNormalizationMode mode, int32_t options, 799 UErrorCode &status) { 800 if(U_FAILURE(status)) { 801 return FALSE; 802 } 803 804 return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(), 805 mode, options, &status); 806} 807 808inline int32_t 809Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2, 810 uint32_t options, 811 UErrorCode &errorCode) { 812 // all argument checking is done in unorm_compare 813 return unorm_compare(s1.getBuffer(), s1.length(), 814 s2.getBuffer(), s2.length(), 815 options, 816 
&errorCode); 817} 818 819U_NAMESPACE_END 820 821#endif /* #if !UCONFIG_NO_NORMALIZATION */ 822 823#endif // NORMLZR_H 824