1/*
2 ********************************************************************
3 * COPYRIGHT:
4 * Copyright (c) 1996-2006, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************
7 */
8
9#ifndef NORMLZR_H
10#define NORMLZR_H
11
12#include "unicode/utypes.h"
13
14/**
15 * \file
16 * \brief C++ API: Unicode Normalization
17 */
18
19#if !UCONFIG_NO_NORMALIZATION
20
21#include "unicode/uobject.h"
22#include "unicode/unistr.h"
23#include "unicode/chariter.h"
24#include "unicode/unorm.h"
25
26
// Forward declaration only: in this header UCharIterator is referred to
// solely through a pointer (see the private "text" member of Normalizer),
// so the complete struct definition is not required here.
27struct UCharIterator;
28typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
29
30U_NAMESPACE_BEGIN
31/**
32 * The Normalizer class supports the standard normalization forms described in
33 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
34 * Unicode Standard Annex #15: Unicode Normalization Forms</a>.
35 *
36 * The Normalizer class consists of two parts:
37 * - static functions that normalize strings or test if strings are normalized
38 * - a Normalizer object is an iterator that takes any kind of text and
39 *   provides iteration over its normalized form
40 *
41 * The Normalizer class is not suitable for subclassing.
42 *
43 * The static functions are basically wrappers around the C implementation,
44 * using UnicodeString instead of UChar*.
45 * For basic information about normalization forms and details about the C API
46 * please see the documentation in unorm.h.
47 *
48 * The iterator API with the Normalizer constructors and the non-static functions
49 * uses a CharacterIterator as input. It is possible to pass a string which
50 * is then internally wrapped in a CharacterIterator.
51 * The input text is not normalized all at once, but incrementally where needed
52 * (providing efficient random access).
53 * This makes it possible to pass in a large text but spend only a small amount of time
54 * normalizing a small part of that text.
55 * However, if the entire text is normalized, then the iterator will be
56 * slower than normalizing the entire text at once and iterating over the result.
57 * A possible use of the Normalizer iterator is also to report an index into the
58 * original text that is close to where the normalized characters come from.
59 *
60 * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
61 * The earlier implementation reported the getIndex() inconsistently,
62 * and previous() could not be used after setIndex(), next(), first(), and current().
63 *
64 * Normalizer allows normalization to start anywhere in the input text by
65 * calling setIndexOnly(), first(), or last().
66 * Without calling any of these, the iterator will start at the beginning of the text.
67 *
68 * At any time, next() returns the next normalized code point (UChar32),
69 * with post-increment semantics (like CharacterIterator::next32PostInc()).
70 * previous() returns the previous normalized code point (UChar32),
71 * with pre-decrement semantics (like CharacterIterator::previous32()).
72 *
73 * current() returns the current code point
74 * (respectively the one at the newly set index) without moving
75 * the getIndex(). Note that if the text at the current position
76 * needs to be normalized, then these functions will do that.
77 * (This is why current() is not const.)
78 * It is more efficient to call setIndexOnly() instead, which does not
79 * normalize.
80 *
81 * getIndex() always refers to the position in the input text where the normalized
82 * code points are returned from. It does not always change with each returned
83 * code point.
84 * The code point that is returned from any of the functions
85 * corresponds to text at or after getIndex(), according to the
86 * function's iteration semantics (post-increment or pre-decrement).
87 *
88 * next() returns a code point from at or after the getIndex()
89 * from before the next() call. After the next() call, the getIndex()
90 * might have moved to where the next code point will be returned from
91 * (from a next() or current() call).
92 * This is semantically equivalent to array access with array[index++]
93 * (post-increment semantics).
94 *
95 * previous() returns a code point from at or after the getIndex()
96 * from after the previous() call.
97 * This is semantically equivalent to array access with array[--index]
98 * (pre-decrement semantics).
99 *
100 * Internally, the Normalizer iterator normalizes a small piece of text
101 * starting at the getIndex() and ending at a following "safe" index.
102 * The normalized result is stored in an internal string buffer, and
103 * the code points are iterated from there.
104 * With multiple iteration calls, this is repeated until the next piece
105 * of text needs to be normalized, and the getIndex() needs to be moved.
106 *
107 * The following "safe" index, the internal buffer, and the secondary
108 * iteration index into that buffer are not exposed on the API.
109 * This also means that it is currently not practical to return to
110 * a particular, arbitrary position in the text because one would need to
111 * know, and be able to set, in addition to the getIndex(), at least also the
112 * current index into the internal buffer.
113 * It is currently only possible to observe when getIndex() changes
114 * (with careful consideration of the iteration semantics),
115 * at which time the internal index will be 0.
116 * For example, if getIndex() is different after next() than before it,
117 * then the internal index is 0 and one can return to this getIndex()
118 * later with setIndexOnly().
119 *
120 * @author Laura Werner, Mark Davis, Markus Scherer
121 * @stable ICU 2.0
122 */
123class U_COMMON_API Normalizer : public UObject {
124public:
125  /**
126   * If DONE is returned from an iteration function that returns a code point,
127   * then there are no more normalization results available.
   * Note that the value 0xffff is also the non-character code point U+FFFF;
   * see next() and previous() for ways to tell the two cases apart.
128   * @stable ICU 2.0
129   */
130  enum {
131      DONE=0xffff
132  };
133
134  // Constructors
135
136  /**
137   * Creates a new <code>Normalizer</code> object for iterating over the
138   * normalized form of a given string.
139   * <p>
140   * @param str   The string to be normalized.  The normalization
141   *              will start at the beginning of the string.
142   *
143   * @param mode  The normalization mode.
144   * @stable ICU 2.0
145   */
146  Normalizer(const UnicodeString& str, UNormalizationMode mode);
147
148  /**
149   * Creates a new <code>Normalizer</code> object for iterating over the
150   * normalized form of a given string.
151   * <p>
152   * @param str   The string to be normalized.  The normalization
153   *              will start at the beginning of the string.
154   *
155   * @param length Length of the string, or -1 if NUL-terminated.
156   * @param mode  The normalization mode.
157   * @stable ICU 2.0
158   */
159  Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
160
161  /**
162   * Creates a new <code>Normalizer</code> object for iterating over the
163   * normalized form of the given text.
164   * <p>
165   * @param iter  The input text to be normalized.  The normalization
166   *              will start at the beginning of the string.
167   *
168   * @param mode  The normalization mode.
169   * @stable ICU 2.0
170   */
171  Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
172
173  /**
174   * Copy constructor.
175   * @param copy The object to be copied.
176   * @stable ICU 2.0
177   */
178  Normalizer(const Normalizer& copy);
179
180  /**
181   * Destructor
182   * @stable ICU 2.0
183   */
184  virtual ~Normalizer();
185
186
187  //-------------------------------------------------------------------------
188  // Static utility methods
189  //-------------------------------------------------------------------------
190
191  /**
192   * Normalizes a <code>UnicodeString</code> according to the specified normalization mode.
193   * This is a wrapper for unorm_normalize(), using UnicodeString's.
194   *
195   * The <code>options</code> parameter specifies which optional
196   * <code>Normalizer</code> features are to be enabled for this operation.
197   *
198   * @param source    the input string to be normalized.
199   * @param mode      the normalization mode
200   * @param options   the optional features to be enabled (0 for no options)
201   * @param result    The normalized string (on output).
202   * @param status    The error code.
203   * @stable ICU 2.0
204   */
205  static void U_EXPORT2 normalize(const UnicodeString& source,
206                        UNormalizationMode mode, int32_t options,
207                        UnicodeString& result,
208                        UErrorCode &status);
209
210  /**
211   * Compose a <code>UnicodeString</code>.
212   * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.
213   * This is a wrapper for unorm_normalize(), using UnicodeString's.
214   *
215   * The <code>options</code> parameter specifies which optional
216   * <code>Normalizer</code> features are to be enabled for this operation.
217   *
218   * @param source    the string to be composed.
219   * @param compat    Perform compatibility decomposition before composition.
220   *                  If this argument is <code>FALSE</code>, only canonical
221   *                  decomposition will be performed.
222   * @param options   the optional features to be enabled (0 for no options)
223   * @param result    The composed string (on output).
224   * @param status    The error code.
225   * @stable ICU 2.0
226   */
227  static void U_EXPORT2 compose(const UnicodeString& source,
228                      UBool compat, int32_t options,
229                      UnicodeString& result,
230                      UErrorCode &status);
231
232  /**
233   * Static method to decompose a <code>UnicodeString</code>.
234   * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.
235   * This is a wrapper for unorm_normalize(), using UnicodeString's.
236   *
237   * The <code>options</code> parameter specifies which optional
238   * <code>Normalizer</code> features are to be enabled for this operation.
239   *
240   * @param source    the string to be decomposed.
241   * @param compat    Perform compatibility decomposition.
242   *                  If this argument is <code>FALSE</code>, only canonical
243   *                  decomposition will be performed.
244   * @param options   the optional features to be enabled (0 for no options)
245   * @param result    The decomposed string (on output).
246   * @param status    The error code.
247   * @stable ICU 2.0
248   */
249  static void U_EXPORT2 decompose(const UnicodeString& source,
250                        UBool compat, int32_t options,
251                        UnicodeString& result,
252                        UErrorCode &status);
253
254  /**
255   * Performs a quick check on a string, to quickly determine if the string is
256   * in a particular normalization format.
257   * This is a wrapper for unorm_quickCheck(), using a UnicodeString.
258   *
259   * Three types of result can be returned: UNORM_YES, UNORM_NO or
260   * UNORM_MAYBE. Result UNORM_YES indicates that the argument
261   * string is in the desired normalized format, UNORM_NO indicates that the
262   * argument string is not in the desired normalized format. A
263   * UNORM_MAYBE result indicates that a more thorough check is required,
264   * the user may have to put the string in its normalized form and compare the
265   * results.
266   * @param source       string for determining if it is in a normalized format
267   * @param mode         normalization format
268   * @param status A reference to a UErrorCode to receive any errors
269   * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
270   *
271   * @see isNormalized
272   * @stable ICU 2.0
273   */
274  static inline UNormalizationCheckResult
275  quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
276
277  /**
278   * Performs a quick check on a string; same as the other version of quickCheck
279   * but takes an extra options parameter like most normalization functions.
280   *
281   * @param source       string for determining if it is in a normalized format
282   * @param mode         normalization format
283   * @param options      the optional features to be enabled (0 for no options)
284   * @param status A reference to a UErrorCode to receive any errors
285   * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
286   *
287   * @see isNormalized
288   * @stable ICU 2.6
289   */
290  static inline UNormalizationCheckResult
291  quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
292
293  /**
294   * Test if a string is in a given normalization form.
295   * This is semantically equivalent to source.equals(normalize(source, mode)) .
296   *
297   * Unlike unorm_quickCheck(), this function returns a definitive result,
298   * never a "maybe".
299   * For NFD, NFKD, and FCD, both functions work exactly the same.
300   * For NFC and NFKC where quickCheck may return "maybe", this function will
301   * perform further tests to arrive at a TRUE/FALSE result.
302   *
303   * @param src        String that is to be tested if it is in a normalization format.
304   * @param mode       Which normalization form to test for.
305   * @param errorCode  ICU error code in/out parameter.
306   *                   Must fulfill U_SUCCESS before the function call.
307   * @return Boolean value indicating whether the source string is in the
308   *         "mode" normalization form.
309   *
310   * @see quickCheck
311   * @stable ICU 2.2
312   */
313  static inline UBool
314  isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
315
316  /**
317   * Test if a string is in a given normalization form; same as the other version of isNormalized
318   * but takes an extra options parameter like most normalization functions.
319   *
320   * @param src        String that is to be tested if it is in a normalization format.
321   * @param mode       Which normalization form to test for.
322   * @param options      the optional features to be enabled (0 for no options)
323   * @param errorCode  ICU error code in/out parameter.
324   *                   Must fulfill U_SUCCESS before the function call.
325   * @return Boolean value indicating whether the source string is in the
326   *         "mode" normalization form.
327   *
328   * @see quickCheck
329   * @stable ICU 2.6
330   */
331  static inline UBool
332  isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
333
334  /**
335   * Concatenate normalized strings, making sure that the result is normalized as well.
336   *
337   * If both the left and the right strings are in
338   * the normalization form according to "mode/options",
339   * then the result will be
340   *
341   * \code
342   *     dest=normalize(left+right, mode, options)
343   * \endcode
344   *
345   * For details see unorm_concatenate in unorm.h.
346   *
347   * @param left Left source string.
348   * @param right Right source string.
349   * @param result The output string.
350   * @param mode The normalization mode.
351   * @param options A bit set of normalization options.
352   * @param errorCode ICU error code in/out parameter.
353   *                   Must fulfill U_SUCCESS before the function call.
354   * @return result
355   *
356   * @see unorm_concatenate
357   * @see normalize
358   * @see unorm_next
359   * @see unorm_previous
360   *
361   * @stable ICU 2.1
362   */
363  static UnicodeString &
364  U_EXPORT2 concatenate(UnicodeString &left, UnicodeString &right,
365              UnicodeString &result,
366              UNormalizationMode mode, int32_t options,
367              UErrorCode &errorCode);
368
369  /**
370   * Compare two strings for canonical equivalence.
371   * Further options include case-insensitive comparison and
372   * code point order (as opposed to code unit order).
373   *
374   * Canonical equivalence between two strings is defined as their normalized
375   * forms (NFD or NFC) being identical.
376   * This function compares strings incrementally instead of normalizing
377   * (and optionally case-folding) both strings entirely,
378   * improving performance significantly.
379   *
380   * Bulk normalization is only necessary if the strings do not fulfill the FCD
381   * conditions. Only in this case, and only if the strings are relatively long,
382   * is memory allocated temporarily.
383   * For FCD strings and short non-FCD strings there is no memory allocation.
384   *
385   * Semantically, this is equivalent to
386   *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
387   * where code point order and foldCase are all optional.
388   *
389   * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
390   * the case folding must be performed first, then the normalization.
391   *
392   * @param s1 First source string.
393   * @param s2 Second source string.
394   *
395   * @param options A bit set of options:
396   *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
397   *     Case-sensitive comparison in code unit order, and the input strings
398   *     are quick-checked for FCD.
399   *
400   *   - UNORM_INPUT_IS_FCD
401   *     Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
402   *     If not set, the function will quickCheck for FCD
403   *     and normalize if necessary.
404   *
405   *   - U_COMPARE_CODE_POINT_ORDER
406   *     Set to choose code point order instead of code unit order
407   *     (see u_strCompare for details).
408   *
409   *   - U_COMPARE_IGNORE_CASE
410   *     Set to compare strings case-insensitively using case folding,
411   *     instead of case-sensitively.
412   *     If set, then the following case folding options are used.
413   *
414   *   - Options as used with case-insensitive comparisons, currently:
415   *
416   *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
417   *    (see u_strCaseCompare for details)
418   *
419   *   - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
420   *
421   * @param errorCode ICU error code in/out parameter.
422   *                  Must fulfill U_SUCCESS before the function call.
423   * @return <0 or 0 or >0 as usual for string comparisons
424   *
425   * @see unorm_compare
426   * @see normalize
427   * @see UNORM_FCD
428   * @see u_strCompare
429   * @see u_strCaseCompare
430   *
431   * @stable ICU 2.2
432   */
433  static inline int32_t
434  compare(const UnicodeString &s1, const UnicodeString &s2,
435          uint32_t options,
436          UErrorCode &errorCode);
437
438  //-------------------------------------------------------------------------
439  // Iteration API
440  //-------------------------------------------------------------------------
441
442  /**
443   * Return the current character in the normalized text.
444   * current() may need to normalize some text at getIndex().
445   * The getIndex() is not changed.
446   *
447   * @return the current normalized code point
448   * @stable ICU 2.0
449   */
450  UChar32              current(void);
451
452  /**
453   * Return the first character in the normalized text.
454   * This is equivalent to setIndexOnly(startIndex()) followed by next().
455   * (Post-increment semantics.)
456   *
457   * @return the first normalized code point
458   * @stable ICU 2.0
459   */
460  UChar32              first(void);
461
462  /**
463   * Return the last character in the normalized text.
464   * This is equivalent to setIndexOnly(endIndex()) followed by previous().
465   * (Pre-decrement semantics.)
466   *
467   * @return the last normalized code point
468   * @stable ICU 2.0
469   */
470  UChar32              last(void);
471
472  /**
473   * Return the next character in the normalized text.
474   * (Post-increment semantics.)
475   * If the end of the text has already been reached, DONE is returned.
476   * The DONE value could be confused with a U+FFFF non-character code point
477   * in the text. If this is possible, you can test getIndex()<endIndex()
478   * before calling next(), or (getIndex()<endIndex() || last()!=DONE)
479   * after calling next(). (Calling last() will change the iterator state!)
480   *
481   * The C API unorm_next() is more efficient and does not have this ambiguity.
482   *
483   * @return the next normalized code point
484   * @stable ICU 2.0
485   */
486  UChar32              next(void);
487
488  /**
489   * Return the previous character in the normalized text and decrement.
490   * (Pre-decrement semantics.)
491   * If the beginning of the text has already been reached, DONE is returned.
492   * The DONE value could be confused with a U+FFFF non-character code point
493   * in the text. If this is possible, you can test
494   * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change
495   * the iterator state!)
496   *
497   * The C API unorm_previous() is more efficient and does not have this ambiguity.
498   *
499   * @return the previous normalized code point
500   * @stable ICU 2.0
501   */
502  UChar32              previous(void);
503
504  /**
505   * Set the iteration position in the input text that is being normalized,
506   * without any immediate normalization.
507   * After setIndexOnly(), getIndex() will return the same index that is
508   * specified here.
509   *
510   * @param index the desired index in the input text.
511   * @stable ICU 2.0
512   */
513  void                 setIndexOnly(int32_t index);
514
515  /**
516   * Reset the index to the beginning of the text.
517   * This is equivalent to setIndexOnly(startIndex()).
518   * @stable ICU 2.0
519   */
520  void                reset(void);
521
522  /**
523   * Retrieve the current iteration position in the input text that is
524   * being normalized.
525   *
526   * A following call to next() will return a normalized code point from
527   * the input text at or after this index.
528   *
529   * After a call to previous(), getIndex() will point at or before the
530   * position in the input text where the normalized code point
531   * was returned from with previous().
532   *
533   * @return the current index in the input text
534   * @stable ICU 2.0
535   */
536  int32_t            getIndex(void) const;
537
538  /**
539   * Retrieve the index of the start of the input text. This is the begin index
540   * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string
541   * over which this <code>Normalizer</code> is iterating.
542   *
543   * @return the smallest index in the input text where the Normalizer operates
544   * @stable ICU 2.0
545   */
546  int32_t            startIndex(void) const;
547
548  /**
549   * Retrieve the index of the end of the input text. This is the end index
550   * of the <code>CharacterIterator</code> or the length of the string
551   * over which this <code>Normalizer</code> is iterating.
552   * This end index is exclusive, i.e., the Normalizer operates only on characters
553   * before this index.
554   *
555   * @return the first index in the input text where the Normalizer does not operate
556   * @stable ICU 2.0
557   */
558  int32_t            endIndex(void) const;
559
560  /**
561   * Returns TRUE when both iterators refer to the same character in the same
562   * input text.
563   *
564   * @param that a Normalizer object to compare this one to
565   * @return comparison result
566   * @stable ICU 2.0
567   */
568  UBool        operator==(const Normalizer& that) const;
569
570  /**
571   * Returns FALSE when both iterators refer to the same character in the same
572   * input text.
573   *
574   * @param that a Normalizer object to compare this one to
575   * @return comparison result
576   * @stable ICU 2.0
577   */
578  inline UBool        operator!=(const Normalizer& that) const;
579
580  /**
581   * Returns a pointer to a new Normalizer that is a clone of this one.
582   * The caller is responsible for deleting the new clone.
583   * @return a pointer to a new Normalizer
584   * @stable ICU 2.0
585   */
586  Normalizer*        clone(void) const;
587
588  /**
589   * Generates a hash code for this iterator.
590   *
591   * @return the hash code
592   * @stable ICU 2.0
593   */
594  int32_t                hashCode(void) const;
595
596  //-------------------------------------------------------------------------
597  // Property access methods
598  //-------------------------------------------------------------------------
599
600  /**
601   * Set the normalization mode for this object.
602   * <p>
603   * <b>Note:</b> If the normalization mode is changed while iterating
604   * over a string, calls to {@link #next() } and {@link #previous() } may
605   * return previously buffered characters in the old normalization mode
606   * until the iteration is able to re-sync at the next base character.
607   * It is safest to call {@link #setIndexOnly }, {@link #reset() },
608   * {@link #setText }, {@link #first() },
609   * {@link #last() }, etc. after calling <code>setMode</code>.
610   * <p>
611   * @param newMode the new mode for this <code>Normalizer</code>.
612   * @see #getUMode
613   * @stable ICU 2.0
614   */
615  void setMode(UNormalizationMode newMode);
616
617  /**
618   * Return the normalization mode for this object.
619   *
620   * This is an unusual name because there used to be a getMode() that
621   * returned a different type.
622   *
623   * @return the mode for this <code>Normalizer</code>
624   * @see #setMode
625   * @stable ICU 2.0
626   */
627  UNormalizationMode getUMode(void) const;
628
629  /**
630   * Set options that affect this <code>Normalizer</code>'s operation.
631   * Options do not change the basic composition or decomposition operation
632   * that is being performed, but they control whether
633   * certain optional portions of the operation are done.
634   * Currently the only available option is obsolete.
635   *
636   * It is possible to specify multiple options that are all turned on or off.
637   *
638   * @param   option  the option(s) whose value is/are to be set.
639   * @param   value   the new setting for the option.  Use <code>TRUE</code> to
640   *                  turn the option(s) on and <code>FALSE</code> to turn it/them off.
641   *
642   * @see #getOption
643   * @stable ICU 2.0
644   */
645  void setOption(int32_t option,
646         UBool value);
647
648  /**
649   * Determine whether an option is turned on or off.
650   * If multiple options are specified, then the result is TRUE if any
651   * of them are set.
652   * <p>
653   * @param option the option(s) that are to be checked
654   * @return TRUE if any of the option(s) are set
655   * @see #setOption
656   * @stable ICU 2.0
657   */
658  UBool getOption(int32_t option) const;
659
660  /**
661   * Set the input text over which this <code>Normalizer</code> will iterate.
662   * The iteration position is set to the beginning.
663   *
664   * @param newText a string that replaces the current input text
665   * @param status a UErrorCode
666   * @stable ICU 2.0
667   */
668  void setText(const UnicodeString& newText,
669           UErrorCode &status);
670
671  /**
672   * Set the input text over which this <code>Normalizer</code> will iterate.
673   * The iteration position is set to the beginning.
674   *
675   * @param newText a CharacterIterator object that replaces the current input text
676   * @param status a UErrorCode
677   * @stable ICU 2.0
678   */
679  void setText(const CharacterIterator& newText,
680           UErrorCode &status);
681
682  /**
683   * Set the input text over which this <code>Normalizer</code> will iterate.
684   * The iteration position is set to the beginning.
685   *
686   * @param newText a string that replaces the current input text
687   * @param length the length of the string, or -1 if NUL-terminated
688   * @param status a UErrorCode
689   * @stable ICU 2.0
690   */
691  void setText(const UChar* newText,
692                    int32_t length,
693            UErrorCode &status);
694  /**
695   * Copies the input text into the UnicodeString argument.
696   *
697   * @param result Receives a copy of the text under iteration.
698   * @stable ICU 2.0
699   */
700  void            getText(UnicodeString&  result);
701
702  /**
703   * ICU "poor man's RTTI", returns a UClassID for this class.
704   * @return a UClassID for this class.
705   * @stable ICU 2.2
706   */
707  static UClassID U_EXPORT2 getStaticClassID();
708
709  /**
710   * ICU "poor man's RTTI", returns a UClassID for the actual class.
711   * @return a UClassID for the actual class.
712   * @stable ICU 2.2
713   */
714  virtual UClassID getDynamicClassID() const;
715
716private:
717  //-------------------------------------------------------------------------
718  // Private functions
719  //-------------------------------------------------------------------------
720
721  Normalizer(); // default constructor not implemented
722  Normalizer &operator=(const Normalizer &that); // assignment operator not implemented
723
724  // Private utility methods for iteration
725  // For documentation, see the source code
726  UBool nextNormalize();
727  UBool previousNormalize();
728
729  void    init(CharacterIterator *iter);
730  void    clearBuffer(void);
731
732  //-------------------------------------------------------------------------
733  // Private data
734  //-------------------------------------------------------------------------
735
736  UNormalizationMode  fUMode;
737  int32_t             fOptions;
738
739  // The input text and our position in it
740  UCharIterator       *text;
741
742  // The normalization buffer is the result of normalization
743  // of the source in [currentIndex..nextIndex[ .
744  int32_t         currentIndex, nextIndex;
745
746  // A buffer for holding intermediate results
747  UnicodeString       buffer;
748  int32_t         bufferPos;
749
750};
751
752//-------------------------------------------------------------------------
753// Inline implementations
754//-------------------------------------------------------------------------
755
756inline UBool
757Normalizer::operator!= (const Normalizer& other) const
758{ return ! operator==(other); }
759
760inline UNormalizationCheckResult
761Normalizer::quickCheck(const UnicodeString& source,
762                       UNormalizationMode mode,
763                       UErrorCode &status) {
764    if(U_FAILURE(status)) {
765        return UNORM_MAYBE;
766    }
767
768    return unorm_quickCheck(source.getBuffer(), source.length(),
769                            mode, &status);
770}
771
772inline UNormalizationCheckResult
773Normalizer::quickCheck(const UnicodeString& source,
774                       UNormalizationMode mode, int32_t options,
775                       UErrorCode &status) {
776    if(U_FAILURE(status)) {
777        return UNORM_MAYBE;
778    }
779
780    return unorm_quickCheckWithOptions(source.getBuffer(), source.length(),
781                                       mode, options, &status);
782}
783
784inline UBool
785Normalizer::isNormalized(const UnicodeString& source,
786                         UNormalizationMode mode,
787                         UErrorCode &status) {
788    if(U_FAILURE(status)) {
789        return FALSE;
790    }
791
792    return unorm_isNormalized(source.getBuffer(), source.length(),
793                              mode, &status);
794}
795
796inline UBool
797Normalizer::isNormalized(const UnicodeString& source,
798                         UNormalizationMode mode, int32_t options,
799                         UErrorCode &status) {
800    if(U_FAILURE(status)) {
801        return FALSE;
802    }
803
804    return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(),
805                                         mode, options, &status);
806}
807
808inline int32_t
809Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
810                    uint32_t options,
811                    UErrorCode &errorCode) {
812  // all argument checking is done in unorm_compare
813  return unorm_compare(s1.getBuffer(), s1.length(),
814                       s2.getBuffer(), s2.length(),
815                       options,
816                       &errorCode);
817}
818
819U_NAMESPACE_END
820
821#endif /* #if !UCONFIG_NO_NORMALIZATION */
822
823#endif // NORMLZR_H
824