1/*
2**********************************************************************
3*   Copyright (C) 2001-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   07/03/01    aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/normalizer2.h"
16#include "unicode/utf16.h"
17#include "cstring.h"
18#include "nortrans.h"
19
20U_NAMESPACE_BEGIN
21
22UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
23
24static inline Transliterator::Token cstrToken(const char *s) {
25    return Transliterator::pointerToken((void *)s);
26}
27
28/**
29 * System registration hook.
30 */
31void NormalizationTransliterator::registerIDs() {
32    // In the Token, the byte after the NUL is the UNormalization2Mode.
33    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
34                                     _create, cstrToken("nfc\0\0"));
35    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
36                                     _create, cstrToken("nfkc\0\0"));
37    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
38                                     _create, cstrToken("nfc\0\1"));
39    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
40                                     _create, cstrToken("nfkc\0\1"));
41    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
42                                     _create, cstrToken("nfc\0\2"));
43    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
44                                     _create, cstrToken("nfc\0\3"));
45    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
46                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
47    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
48                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
49    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
50                                            UNICODE_STRING_SIMPLE("NFD"), FALSE);
51    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
52                                            UNICODE_STRING_SIMPLE("FCD"), FALSE);
53}
54
55/**
56 * Factory methods
57 */
58Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
59                                                     Token context) {
60    const char *name = (const char *)context.pointer;
61    UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
62    UErrorCode errorCode = U_ZERO_ERROR;
63    const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
64    if(U_SUCCESS(errorCode)) {
65        return new NormalizationTransliterator(ID, *norm2);
66    } else {
67        return NULL;
68    }
69}
70
71/**
72 * Constructs a transliterator.
73 */
74NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
75                                                         const Normalizer2 &norm2) :
76    Transliterator(id, 0), fNorm2(norm2) {}
77
78/**
79 * Destructor.
80 */
81NormalizationTransliterator::~NormalizationTransliterator() {
82}
83
84/**
85 * Copy constructor.
86 */
87NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
88    Transliterator(o), fNorm2(o.fNorm2) {}
89
90/**
91 * Transliterator API.
92 */
93Transliterator* NormalizationTransliterator::clone(void) const {
94    return new NormalizationTransliterator(*this);
95}
96
97/**
98 * Implements {@link Transliterator#handleTransliterate}.
99 */
100void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
101                                                      UBool isIncremental) const {
102    // start and limit of the input range
103    int32_t start = offsets.start;
104    int32_t limit = offsets.limit;
105    if(start >= limit) {
106        return;
107    }
108
109    /*
110     * Normalize as short chunks at a time as possible even in
111     * bulk mode, so that styled text is minimally disrupted.
112     * In incremental mode, a chunk that ends with offsets.limit
113     * must not be normalized.
114     *
115     * If it was known that the input text is not styled, then
116     * a bulk mode normalization could look like this:
117
118    UnicodeString input, normalized;
119    int32_t length = limit - start;
120    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
121    input.releaseBuffer(length);
122
123    UErrorCode status = U_ZERO_ERROR;
124    fNorm2.normalize(input, normalized, status);
125
126    text.handleReplaceBetween(start, limit, normalized);
127
128    int32_t delta = normalized.length() - length;
129    offsets.contextLimit += delta;
130    offsets.limit += delta;
131    offsets.start = limit + delta;
132
133     */
134    UErrorCode errorCode = U_ZERO_ERROR;
135    UnicodeString segment;
136    UnicodeString normalized;
137    UChar32 c = text.char32At(start);
138    do {
139        int32_t prev = start;
140        // Skip at least one character so we make progress.
141        // c holds the character at start.
142        segment.remove();
143        do {
144            segment.append(c);
145            start += U16_LENGTH(c);
146        } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
147        if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
148            // stop in incremental mode when we reach the input limit
149            // in case there are additional characters that could change the
150            // normalization result
151            start=prev;
152            break;
153        }
154        fNorm2.normalize(segment, normalized, errorCode);
155        if(U_FAILURE(errorCode)) {
156            break;
157        }
158        if(segment != normalized) {
159            // replace the input chunk with its normalized form
160            text.handleReplaceBetween(prev, start, normalized);
161
162            // update all necessary indexes accordingly
163            int32_t delta = normalized.length() - (start - prev);
164            start += delta;
165            limit += delta;
166        }
167    } while(start < limit);
168
169    offsets.start = start;
170    offsets.contextLimit += limit - offsets.limit;
171    offsets.limit = limit;
172}
173
174U_NAMESPACE_END
175
176#endif /* #if !UCONFIG_NO_TRANSLITERATION */
177