/*
*******************************************************************************
*
*   Copyright (C) 2009-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  normalizer2.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009nov22
*   created by: Markus W. Scherer
*/

17#ifndef __NORMALIZER2_H__
18#define __NORMALIZER2_H__
19
20/**
21 * \file
22 * \brief C++ API: New API for Unicode Normalization.
23 */
24
25#include "unicode/utypes.h"
26
27#if !UCONFIG_NO_NORMALIZATION
28
29#include "unicode/uniset.h"
30#include "unicode/unistr.h"
31#include "unicode/unorm2.h"
32
33U_NAMESPACE_BEGIN
34
35/**
36 * Unicode normalization functionality for standard Unicode normalization or
37 * for using custom mapping tables.
38 * All instances of this class are unmodifiable/immutable.
39 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
40 * The Normalizer2 class is not intended for public subclassing.
41 *
42 * The primary functions are to produce a normalized string and to detect whether
43 * a string is already normalized.
44 * The most commonly used normalization forms are those defined in
45 * http://www.unicode.org/unicode/reports/tr15/
46 * However, this API supports additional normalization forms for specialized purposes.
47 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
48 * and can be used in implementations of UTS #46.
49 *
50 * Not only are the standard compose and decompose modes supplied,
51 * but additional modes are provided as documented in the Mode enum.
52 *
53 * Some of the functions in this class identify normalization boundaries.
54 * At a normalization boundary, the portions of the string
55 * before it and starting from it do not interact and can be handled independently.
56 *
57 * The spanQuickCheckYes() stops at a normalization boundary.
58 * When the goal is a normalized string, then the text before the boundary
59 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
60 *
61 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
62 * a character is guaranteed to be at a normalization boundary,
63 * regardless of context.
64 * This is used for moving from one normalization boundary to the next
65 * or preceding boundary, and for performing iterative normalization.
66 *
67 * Iterative normalization is useful when only a small portion of a
68 * longer string needs to be processed.
69 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
70 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
71 * (to process only the substring for which sort key bytes are computed).
72 *
73 * The set of normalization boundaries returned by these functions may not be
74 * complete: There may be more boundaries that could be returned.
75 * Different functions may return different boundaries.
76 * @stable ICU 4.4
77 */
78class U_COMMON_API Normalizer2 : public UObject {
79public:
80    /**
81     * Returns a Normalizer2 instance which uses the specified data file
82     * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
83     * and which composes or decomposes text according to the specified mode.
84     * Returns an unmodifiable singleton instance. Do not delete it.
85     *
86     * Use packageName=NULL for data files that are part of ICU's own data.
87     * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
88     * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
89     * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
90     *
91     * @param packageName NULL for ICU built-in data, otherwise application data package name
92     * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
93     * @param mode normalization mode (compose or decompose etc.)
94     * @param errorCode Standard ICU error code. Its input value must
95     *                  pass the U_SUCCESS() test, or else the function returns
96     *                  immediately. Check for U_FAILURE() on output or use with
97     *                  function chaining. (See User Guide for details.)
98     * @return the requested Normalizer2, if successful
99     * @stable ICU 4.4
100     */
101    static const Normalizer2 *
102    getInstance(const char *packageName,
103                const char *name,
104                UNormalization2Mode mode,
105                UErrorCode &errorCode);
106
107    /**
108     * Returns the normalized form of the source string.
109     * @param src source string
110     * @param errorCode Standard ICU error code. Its input value must
111     *                  pass the U_SUCCESS() test, or else the function returns
112     *                  immediately. Check for U_FAILURE() on output or use with
113     *                  function chaining. (See User Guide for details.)
114     * @return normalized src
115     * @stable ICU 4.4
116     */
117    UnicodeString
118    normalize(const UnicodeString &src, UErrorCode &errorCode) const {
119        UnicodeString result;
120        normalize(src, result, errorCode);
121        return result;
122    }
123    /**
124     * Writes the normalized form of the source string to the destination string
125     * (replacing its contents) and returns the destination string.
126     * The source and destination strings must be different objects.
127     * @param src source string
128     * @param dest destination string; its contents is replaced with normalized src
129     * @param errorCode Standard ICU error code. Its input value must
130     *                  pass the U_SUCCESS() test, or else the function returns
131     *                  immediately. Check for U_FAILURE() on output or use with
132     *                  function chaining. (See User Guide for details.)
133     * @return dest
134     * @stable ICU 4.4
135     */
136    virtual UnicodeString &
137    normalize(const UnicodeString &src,
138              UnicodeString &dest,
139              UErrorCode &errorCode) const = 0;
140    /**
141     * Appends the normalized form of the second string to the first string
142     * (merging them at the boundary) and returns the first string.
143     * The result is normalized if the first string was normalized.
144     * The first and second strings must be different objects.
145     * @param first string, should be normalized
146     * @param second string, will be normalized
147     * @param errorCode Standard ICU error code. Its input value must
148     *                  pass the U_SUCCESS() test, or else the function returns
149     *                  immediately. Check for U_FAILURE() on output or use with
150     *                  function chaining. (See User Guide for details.)
151     * @return first
152     * @stable ICU 4.4
153     */
154    virtual UnicodeString &
155    normalizeSecondAndAppend(UnicodeString &first,
156                             const UnicodeString &second,
157                             UErrorCode &errorCode) const = 0;
158    /**
159     * Appends the second string to the first string
160     * (merging them at the boundary) and returns the first string.
161     * The result is normalized if both the strings were normalized.
162     * The first and second strings must be different objects.
163     * @param first string, should be normalized
164     * @param second string, should be normalized
165     * @param errorCode Standard ICU error code. Its input value must
166     *                  pass the U_SUCCESS() test, or else the function returns
167     *                  immediately. Check for U_FAILURE() on output or use with
168     *                  function chaining. (See User Guide for details.)
169     * @return first
170     * @stable ICU 4.4
171     */
172    virtual UnicodeString &
173    append(UnicodeString &first,
174           const UnicodeString &second,
175           UErrorCode &errorCode) const = 0;
176
177    /**
178     * Gets the decomposition mapping of c.
179     * Roughly equivalent to normalizing the String form of c
180     * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
181     * returns FALSE and does not write a string
182     * if c does not have a decomposition mapping in this instance's data.
183     * This function is independent of the mode of the Normalizer2.
184     * @param c code point
185     * @param decomposition String object which will be set to c's
186     *                      decomposition mapping, if there is one.
187     * @return TRUE if c has a decomposition, otherwise FALSE
188     * @draft ICU 4.6
189     */
190    virtual UBool
191    getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
192
193    /**
194     * Tests if the string is normalized.
195     * Internally, in cases where the quickCheck() method would return "maybe"
196     * (which is only possible for the two COMPOSE modes) this method
197     * resolves to "yes" or "no" to provide a definitive result,
198     * at the cost of doing more work in those cases.
199     * @param s input string
200     * @param errorCode Standard ICU error code. Its input value must
201     *                  pass the U_SUCCESS() test, or else the function returns
202     *                  immediately. Check for U_FAILURE() on output or use with
203     *                  function chaining. (See User Guide for details.)
204     * @return TRUE if s is normalized
205     * @stable ICU 4.4
206     */
207    virtual UBool
208    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
209
210    /**
211     * Tests if the string is normalized.
212     * For the two COMPOSE modes, the result could be "maybe" in cases that
213     * would take a little more work to resolve definitively.
214     * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
215     * combination of quick check + normalization, to avoid
216     * re-checking the "yes" prefix.
217     * @param s input string
218     * @param errorCode Standard ICU error code. Its input value must
219     *                  pass the U_SUCCESS() test, or else the function returns
220     *                  immediately. Check for U_FAILURE() on output or use with
221     *                  function chaining. (See User Guide for details.)
222     * @return UNormalizationCheckResult
223     * @stable ICU 4.4
224     */
225    virtual UNormalizationCheckResult
226    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
227
228    /**
229     * Returns the end of the normalized substring of the input string.
230     * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
231     * the substring <code>UnicodeString(s, 0, end)</code>
232     * will pass the quick check with a "yes" result.
233     *
234     * The returned end index is usually one or more characters before the
235     * "no" or "maybe" character: The end index is at a normalization boundary.
236     * (See the class documentation for more about normalization boundaries.)
237     *
238     * When the goal is a normalized string and most input strings are expected
239     * to be normalized already, then call this method,
240     * and if it returns a prefix shorter than the input string,
241     * copy that prefix and use normalizeSecondAndAppend() for the remainder.
242     * @param s input string
243     * @param errorCode Standard ICU error code. Its input value must
244     *                  pass the U_SUCCESS() test, or else the function returns
245     *                  immediately. Check for U_FAILURE() on output or use with
246     *                  function chaining. (See User Guide for details.)
247     * @return "yes" span end index
248     * @stable ICU 4.4
249     */
250    virtual int32_t
251    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
252
253    /**
254     * Tests if the character always has a normalization boundary before it,
255     * regardless of context.
256     * If true, then the character does not normalization-interact with
257     * preceding characters.
258     * In other words, a string containing this character can be normalized
259     * by processing portions before this character and starting from this
260     * character independently.
261     * This is used for iterative normalization. See the class documentation for details.
262     * @param c character to test
263     * @return TRUE if c has a normalization boundary before it
264     * @stable ICU 4.4
265     */
266    virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
267
268    /**
269     * Tests if the character always has a normalization boundary after it,
270     * regardless of context.
271     * If true, then the character does not normalization-interact with
272     * following characters.
273     * In other words, a string containing this character can be normalized
274     * by processing portions up to this character and after this
275     * character independently.
276     * This is used for iterative normalization. See the class documentation for details.
277     * Note that this operation may be significantly slower than hasBoundaryBefore().
278     * @param c character to test
279     * @return TRUE if c has a normalization boundary after it
280     * @stable ICU 4.4
281     */
282    virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
283
284    /**
285     * Tests if the character is normalization-inert.
286     * If true, then the character does not change, nor normalization-interact with
287     * preceding or following characters.
288     * In other words, a string containing this character can be normalized
289     * by processing portions before this character and after this
290     * character independently.
291     * This is used for iterative normalization. See the class documentation for details.
292     * Note that this operation may be significantly slower than hasBoundaryBefore().
293     * @param c character to test
294     * @return TRUE if c is normalization-inert
295     * @stable ICU 4.4
296     */
297    virtual UBool isInert(UChar32 c) const = 0;
298
299private:
300    // No ICU "poor man's RTTI" for this class nor its subclasses.
301    virtual UClassID getDynamicClassID() const;
302};
303
304/**
305 * Normalization filtered by a UnicodeSet.
306 * Normalizes portions of the text contained in the filter set and leaves
307 * portions not contained in the filter set unchanged.
308 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
309 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
310 * This class implements all of (and only) the Normalizer2 API.
311 * An instance of this class is unmodifiable/immutable but is constructed and
312 * must be destructed by the owner.
313 * @stable ICU 4.4
314 */
315class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
316public:
317    /**
318     * Constructs a filtered normalizer wrapping any Normalizer2 instance
319     * and a filter set.
320     * Both are aliased and must not be modified or deleted while this object
321     * is used.
322     * The filter set should be frozen; otherwise the performance will suffer greatly.
323     * @param n2 wrapped Normalizer2 instance
324     * @param filterSet UnicodeSet which determines the characters to be normalized
325     * @stable ICU 4.4
326     */
327    FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
328            norm2(n2), set(filterSet) {}
329
330    /**
331     * Writes the normalized form of the source string to the destination string
332     * (replacing its contents) and returns the destination string.
333     * The source and destination strings must be different objects.
334     * @param src source string
335     * @param dest destination string; its contents is replaced with normalized src
336     * @param errorCode Standard ICU error code. Its input value must
337     *                  pass the U_SUCCESS() test, or else the function returns
338     *                  immediately. Check for U_FAILURE() on output or use with
339     *                  function chaining. (See User Guide for details.)
340     * @return dest
341     * @stable ICU 4.4
342     */
343    virtual UnicodeString &
344    normalize(const UnicodeString &src,
345              UnicodeString &dest,
346              UErrorCode &errorCode) const;
347    /**
348     * Appends the normalized form of the second string to the first string
349     * (merging them at the boundary) and returns the first string.
350     * The result is normalized if the first string was normalized.
351     * The first and second strings must be different objects.
352     * @param first string, should be normalized
353     * @param second string, will be normalized
354     * @param errorCode Standard ICU error code. Its input value must
355     *                  pass the U_SUCCESS() test, or else the function returns
356     *                  immediately. Check for U_FAILURE() on output or use with
357     *                  function chaining. (See User Guide for details.)
358     * @return first
359     * @stable ICU 4.4
360     */
361    virtual UnicodeString &
362    normalizeSecondAndAppend(UnicodeString &first,
363                             const UnicodeString &second,
364                             UErrorCode &errorCode) const;
365    /**
366     * Appends the second string to the first string
367     * (merging them at the boundary) and returns the first string.
368     * The result is normalized if both the strings were normalized.
369     * The first and second strings must be different objects.
370     * @param first string, should be normalized
371     * @param second string, should be normalized
372     * @param errorCode Standard ICU error code. Its input value must
373     *                  pass the U_SUCCESS() test, or else the function returns
374     *                  immediately. Check for U_FAILURE() on output or use with
375     *                  function chaining. (See User Guide for details.)
376     * @return first
377     * @stable ICU 4.4
378     */
379    virtual UnicodeString &
380    append(UnicodeString &first,
381           const UnicodeString &second,
382           UErrorCode &errorCode) const;
383
384    /**
385     * Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))
386     * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
387     * This function is independent of the mode of the Normalizer2.
388     * @param c code point
389     * @param decomposition String object which will be set to c's
390     *                      decomposition mapping, if there is one.
391     * @return TRUE if c has a decomposition, otherwise FALSE
392     * @draft ICU 4.6
393     */
394    virtual UBool
395    getDecomposition(UChar32 c, UnicodeString &decomposition) const;
396
397    /**
398     * Tests if the string is normalized.
399     * For details see the Normalizer2 base class documentation.
400     * @param s input string
401     * @param errorCode Standard ICU error code. Its input value must
402     *                  pass the U_SUCCESS() test, or else the function returns
403     *                  immediately. Check for U_FAILURE() on output or use with
404     *                  function chaining. (See User Guide for details.)
405     * @return TRUE if s is normalized
406     * @stable ICU 4.4
407     */
408    virtual UBool
409    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
410    /**
411     * Tests if the string is normalized.
412     * For details see the Normalizer2 base class documentation.
413     * @param s input string
414     * @param errorCode Standard ICU error code. Its input value must
415     *                  pass the U_SUCCESS() test, or else the function returns
416     *                  immediately. Check for U_FAILURE() on output or use with
417     *                  function chaining. (See User Guide for details.)
418     * @return UNormalizationCheckResult
419     * @stable ICU 4.4
420     */
421    virtual UNormalizationCheckResult
422    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
423    /**
424     * Returns the end of the normalized substring of the input string.
425     * For details see the Normalizer2 base class documentation.
426     * @param s input string
427     * @param errorCode Standard ICU error code. Its input value must
428     *                  pass the U_SUCCESS() test, or else the function returns
429     *                  immediately. Check for U_FAILURE() on output or use with
430     *                  function chaining. (See User Guide for details.)
431     * @return "yes" span end index
432     * @stable ICU 4.4
433     */
434    virtual int32_t
435    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
436
437    /**
438     * Tests if the character always has a normalization boundary before it,
439     * regardless of context.
440     * For details see the Normalizer2 base class documentation.
441     * @param c character to test
442     * @return TRUE if c has a normalization boundary before it
443     * @stable ICU 4.4
444     */
445    virtual UBool hasBoundaryBefore(UChar32 c) const;
446
447    /**
448     * Tests if the character always has a normalization boundary after it,
449     * regardless of context.
450     * For details see the Normalizer2 base class documentation.
451     * @param c character to test
452     * @return TRUE if c has a normalization boundary after it
453     * @stable ICU 4.4
454     */
455    virtual UBool hasBoundaryAfter(UChar32 c) const;
456
457    /**
458     * Tests if the character is normalization-inert.
459     * For details see the Normalizer2 base class documentation.
460     * @param c character to test
461     * @return TRUE if c is normalization-inert
462     * @stable ICU 4.4
463     */
464    virtual UBool isInert(UChar32 c) const;
465private:
466    UnicodeString &
467    normalize(const UnicodeString &src,
468              UnicodeString &dest,
469              USetSpanCondition spanCondition,
470              UErrorCode &errorCode) const;
471
472    UnicodeString &
473    normalizeSecondAndAppend(UnicodeString &first,
474                             const UnicodeString &second,
475                             UBool doNormalize,
476                             UErrorCode &errorCode) const;
477
478    const Normalizer2 &norm2;
479    const UnicodeSet &set;
480};
481
482U_NAMESPACE_END
483
484#endif  // !UCONFIG_NO_NORMALIZATION
485#endif  // __NORMALIZER2_H__
486