1/*
2**********************************************************************
3*   Copyright (C) 2001-2007, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   07/03/01    aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/uniset.h"
16#include "unicode/uiter.h"
17#include "nortrans.h"
18#include "unormimp.h"
19#include "ucln_in.h"
20
21U_NAMESPACE_BEGIN
22
23UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
24
25/**
26 * System registration hook.
27 */
28void NormalizationTransliterator::registerIDs() {
29    UErrorCode errorCode = U_ZERO_ERROR;
30    if(!unorm_haveData(&errorCode)) {
31        return;
32    }
33
34    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
35                                     _create, integerToken(UNORM_NFC));
36    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
37                                     _create, integerToken(UNORM_NFKC));
38    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
39                                     _create, integerToken(UNORM_NFD));
40    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
41                                     _create, integerToken(UNORM_NFKD));
42    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
43                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
44    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
45                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
46}
47
48/**
49 * Factory methods
50 */
51Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
52                                                     Token context) {
53    return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
54}
55
56/**
57 * Constructs a transliterator.
58 */
59NormalizationTransliterator::NormalizationTransliterator(
60                                 const UnicodeString& id,
61                                 UNormalizationMode mode, int32_t opt) :
62    Transliterator(id, 0) {
63    fMode = mode;
64    options = opt;
65}
66
67/**
68 * Destructor.
69 */
70NormalizationTransliterator::~NormalizationTransliterator() {
71}
72
73/**
74 * Copy constructor.
75 */
76NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
77Transliterator(o) {
78    fMode = o.fMode;
79    options = o.options;
80}
81
/**
 * Assignment operator (disabled: the implementation below is commented out).
 */
85/*NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
86    Transliterator::operator=(o);
87    fMode = o.fMode;
88    options = o.options;
89    return *this;
90}*/
91
92/**
93 * Transliterator API.
94 */
95Transliterator* NormalizationTransliterator::clone(void) const {
96    return new NormalizationTransliterator(*this);
97}
98
99/**
100 * Implements {@link Transliterator#handleTransliterate}.
101 */
102void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
103                                                      UBool isIncremental) const {
104    // start and limit of the input range
105    int32_t start = offsets.start;
106    int32_t limit = offsets.limit;
107    int32_t length, delta;
108
109    if(start >= limit) {
110        return;
111    }
112
113    // a C code unit iterator, implemented around the Replaceable
114    UCharIterator iter;
115    uiter_setReplaceable(&iter, &text);
116
117    // the output string and buffer pointer
118    UnicodeString output;
119    UChar *buffer;
120    UBool neededToNormalize;
121
122    UErrorCode errorCode;
123
124    /*
125     * Normalize as short chunks at a time as possible even in
126     * bulk mode, so that styled text is minimally disrupted.
127     * In incremental mode, a chunk that ends with offsets.limit
128     * must not be normalized.
129     *
130     * If it was known that the input text is not styled, then
131     * a bulk mode normalization could look like this:
132     *
133
134    UChar staticChars[256];
135    UnicodeString input;
136
137    length = limit - start;
138    input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
139
140    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
141    input.releaseBuffer(length);
142
143    UErrorCode status = U_ZERO_ERROR;
144    Normalizer::normalize(input, fMode, options, output, status);
145
146    text.handleReplaceBetween(start, limit, output);
147
148    int32_t delta = output.length() - length;
149    offsets.contextLimit += delta;
150    offsets.limit += delta;
151    offsets.start = limit + delta;
152
153     *
154     */
155    while(start < limit) {
156        // set the iterator limits for the remaining input range
157        // this is a moving target because of the replacements in the text object
158        iter.start = iter.index = start;
159        iter.limit = limit;
160
161        // incrementally normalize a small chunk of the input
162        buffer = output.getBuffer(-1);
163        errorCode = U_ZERO_ERROR;
164        length = unorm_next(&iter, buffer, output.getCapacity(),
165                            fMode, 0,
166                            TRUE, &neededToNormalize,
167                            &errorCode);
168        output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
169
170        if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
171            // use a larger output string buffer and do it again from the start
172            iter.index = start;
173            buffer = output.getBuffer(length);
174            errorCode = U_ZERO_ERROR;
175            length = unorm_next(&iter, buffer, output.getCapacity(),
176                                fMode, 0,
177                                TRUE, &neededToNormalize,
178                                &errorCode);
179            output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
180        }
181
182        if(U_FAILURE(errorCode)) {
183            break;
184        }
185
186        limit = iter.index;
187        if(isIncremental && limit == iter.limit) {
188            // stop in incremental mode when we reach the input limit
189            // in case there are additional characters that could change the
190            // normalization result
191
192            // UNLESS all characters in the result of the normalization of
193            // the last run are in the skippable set
194            const UChar *s=output.getBuffer();
195            int32_t i=0, outLength=output.length();
196            UChar32 c;
197
198            while(i<outLength) {
199                U16_NEXT(s, i, outLength, c);
200                if(!unorm_isNFSkippable(c, fMode)) {
201                    outLength=-1; // I wish C++ had labeled loops and break outer; ...
202                    break;
203                }
204            }
205            if (outLength<0) {
206                break;
207            }
208        }
209
210        if(neededToNormalize) {
211            // replace the input chunk with its normalized form
212            text.handleReplaceBetween(start, limit, output);
213
214            // update all necessary indexes accordingly
215            delta = length - (limit - start);   // length change in the text object
216            start = limit += delta;             // the next chunk starts where this one ends, with adjustment
217            limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
218            offsets.contextLimit += delta;
219        } else {
220            // delta == 0
221            start = limit;
222            limit = offsets.limit;
223        }
224    }
225
226    offsets.start = start;
227}
228
229U_NAMESPACE_END
230
231#endif /* #if !UCONFIG_NO_TRANSLITERATION */
232