1/*
2 *************************************************************************
3 * COPYRIGHT:
4 * Copyright (c) 1996-2010, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 *************************************************************************
7 */
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_NORMALIZATION
12
13#include "unicode/uniset.h"
14#include "unicode/unistr.h"
15#include "unicode/chariter.h"
16#include "unicode/schriter.h"
17#include "unicode/uchriter.h"
18#include "unicode/normlzr.h"
19#include "cmemory.h"
20#include "normalizer2impl.h"
21#include "uprops.h"  // for uniset_getUnicode32Instance()
22
23U_NAMESPACE_BEGIN
24
25UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
26
27//-------------------------------------------------------------------------
28// Constructors and other boilerplate
29//-------------------------------------------------------------------------
30
31Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
32    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
33    text(new StringCharacterIterator(str)),
34    currentIndex(0), nextIndex(0),
35    buffer(), bufferPos(0)
36{
37    init();
38}
39
40Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
41    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
42    text(new UCharCharacterIterator(str, length)),
43    currentIndex(0), nextIndex(0),
44    buffer(), bufferPos(0)
45{
46    init();
47}
48
49Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
50    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
51    text(iter.clone()),
52    currentIndex(0), nextIndex(0),
53    buffer(), bufferPos(0)
54{
55    init();
56}
57
58Normalizer::Normalizer(const Normalizer &copy) :
59    UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
60    text(copy.text->clone()),
61    currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
62    buffer(copy.buffer), bufferPos(copy.bufferPos)
63{
64    init();
65}
66
67static const UChar _NUL=0;
68
69void
70Normalizer::init() {
71    UErrorCode errorCode=U_ZERO_ERROR;
72    fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
73    if(fOptions&UNORM_UNICODE_3_2) {
74        delete fFilteredNorm2;
75        fNorm2=fFilteredNorm2=
76            new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
77    }
78    if(U_FAILURE(errorCode)) {
79        errorCode=U_ZERO_ERROR;
80        fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
81    }
82}
83
84Normalizer::~Normalizer()
85{
86    delete fFilteredNorm2;
87    delete text;
88}
89
90Normalizer*
91Normalizer::clone() const
92{
93    return new Normalizer(*this);
94}
95
96/**
97 * Generates a hash code for this iterator.
98 */
99int32_t Normalizer::hashCode() const
100{
101    return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
102}
103
104UBool Normalizer::operator==(const Normalizer& that) const
105{
106    return
107        this==&that ||
108        (fUMode==that.fUMode &&
109        fOptions==that.fOptions &&
110        *text==*that.text &&
111        buffer==that.buffer &&
112        bufferPos==that.bufferPos &&
113        nextIndex==that.nextIndex);
114}
115
116//-------------------------------------------------------------------------
117// Static utility methods
118//-------------------------------------------------------------------------
119
120void U_EXPORT2
121Normalizer::normalize(const UnicodeString& source,
122                      UNormalizationMode mode, int32_t options,
123                      UnicodeString& result,
124                      UErrorCode &status) {
125    if(source.isBogus() || U_FAILURE(status)) {
126        result.setToBogus();
127        if(U_SUCCESS(status)) {
128            status=U_ILLEGAL_ARGUMENT_ERROR;
129        }
130    } else {
131        UnicodeString localDest;
132        UnicodeString *dest;
133
134        if(&source!=&result) {
135            dest=&result;
136        } else {
137            // the source and result strings are the same object, use a temporary one
138            dest=&localDest;
139        }
140        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
141        if(U_SUCCESS(status)) {
142            if(options&UNORM_UNICODE_3_2) {
143                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
144                    normalize(source, *dest, status);
145            } else {
146                n2->normalize(source, *dest, status);
147            }
148        }
149        if(dest==&localDest && U_SUCCESS(status)) {
150            result=*dest;
151        }
152    }
153}
154
155void U_EXPORT2
156Normalizer::compose(const UnicodeString& source,
157                    UBool compat, int32_t options,
158                    UnicodeString& result,
159                    UErrorCode &status) {
160    normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
161}
162
163void U_EXPORT2
164Normalizer::decompose(const UnicodeString& source,
165                      UBool compat, int32_t options,
166                      UnicodeString& result,
167                      UErrorCode &status) {
168    normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
169}
170
171UNormalizationCheckResult
172Normalizer::quickCheck(const UnicodeString& source,
173                       UNormalizationMode mode, int32_t options,
174                       UErrorCode &status) {
175    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
176    if(U_SUCCESS(status)) {
177        if(options&UNORM_UNICODE_3_2) {
178            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
179                quickCheck(source, status);
180        } else {
181            return n2->quickCheck(source, status);
182        }
183    } else {
184        return UNORM_MAYBE;
185    }
186}
187
188UBool
189Normalizer::isNormalized(const UnicodeString& source,
190                         UNormalizationMode mode, int32_t options,
191                         UErrorCode &status) {
192    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
193    if(U_SUCCESS(status)) {
194        if(options&UNORM_UNICODE_3_2) {
195            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
196                isNormalized(source, status);
197        } else {
198            return n2->isNormalized(source, status);
199        }
200    } else {
201        return FALSE;
202    }
203}
204
205UnicodeString & U_EXPORT2
206Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
207                        UnicodeString &result,
208                        UNormalizationMode mode, int32_t options,
209                        UErrorCode &errorCode) {
210    if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
211        result.setToBogus();
212        if(U_SUCCESS(errorCode)) {
213            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
214        }
215    } else {
216        UnicodeString localDest;
217        UnicodeString *dest;
218
219        if(&right!=&result) {
220            dest=&result;
221        } else {
222            // the right and result strings are the same object, use a temporary one
223            dest=&localDest;
224        }
225        *dest=left;
226        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
227        if(U_SUCCESS(errorCode)) {
228            if(options&UNORM_UNICODE_3_2) {
229                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
230                    append(*dest, right, errorCode);
231            } else {
232                n2->append(*dest, right, errorCode);
233            }
234        }
235        if(dest==&localDest && U_SUCCESS(errorCode)) {
236            result=*dest;
237        }
238    }
239    return result;
240}
241
242//-------------------------------------------------------------------------
243// Iteration API
244//-------------------------------------------------------------------------
245
246/**
247 * Return the current character in the normalized text.
248 */
249UChar32 Normalizer::current() {
250    if(bufferPos<buffer.length() || nextNormalize()) {
251        return buffer.char32At(bufferPos);
252    } else {
253        return DONE;
254    }
255}
256
257/**
258 * Return the next character in the normalized text and advance
259 * the iteration position by one.  If the end
260 * of the text has already been reached, {@link #DONE} is returned.
261 */
262UChar32 Normalizer::next() {
263    if(bufferPos<buffer.length() ||  nextNormalize()) {
264        UChar32 c=buffer.char32At(bufferPos);
265        bufferPos+=UTF_CHAR_LENGTH(c);
266        return c;
267    } else {
268        return DONE;
269    }
270}
271
272/**
273 * Return the previous character in the normalized text and decrement
274 * the iteration position by one.  If the beginning
275 * of the text has already been reached, {@link #DONE} is returned.
276 */
277UChar32 Normalizer::previous() {
278    if(bufferPos>0 || previousNormalize()) {
279        UChar32 c=buffer.char32At(bufferPos-1);
280        bufferPos-=UTF_CHAR_LENGTH(c);
281        return c;
282    } else {
283        return DONE;
284    }
285}
286
287void Normalizer::reset() {
288    currentIndex=nextIndex=text->setToStart();
289    clearBuffer();
290}
291
292void
293Normalizer::setIndexOnly(int32_t index) {
294    text->setIndex(index);  // pins index
295    currentIndex=nextIndex=text->getIndex();
296    clearBuffer();
297}
298
299/**
300 * Return the first character in the normalized text.  This resets
301 * the <tt>Normalizer's</tt> position to the beginning of the text.
302 */
303UChar32 Normalizer::first() {
304    reset();
305    return next();
306}
307
308/**
309 * Return the last character in the normalized text.  This resets
310 * the <tt>Normalizer's</tt> position to be just before the
311 * the input text corresponding to that normalized character.
312 */
313UChar32 Normalizer::last() {
314    currentIndex=nextIndex=text->setToEnd();
315    clearBuffer();
316    return previous();
317}
318
319/**
320 * Retrieve the current iteration position in the input text that is
321 * being normalized.  This method is useful in applications such as
322 * searching, where you need to be able to determine the position in
323 * the input text that corresponds to a given normalized output character.
324 * <p>
325 * <b>Note:</b> This method sets the position in the <em>input</em>, while
326 * {@link #next} and {@link #previous} iterate through characters in the
327 * <em>output</em>.  This means that there is not necessarily a one-to-one
328 * correspondence between characters returned by <tt>next</tt> and
329 * <tt>previous</tt> and the indices passed to and returned from
330 * <tt>setIndex</tt> and {@link #getIndex}.
331 *
332 */
333int32_t Normalizer::getIndex() const {
334    if(bufferPos<buffer.length()) {
335        return currentIndex;
336    } else {
337        return nextIndex;
338    }
339}
340
341/**
342 * Retrieve the index of the start of the input text.  This is the begin index
343 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
344 * over which this <tt>Normalizer</tt> is iterating
345 */
346int32_t Normalizer::startIndex() const {
347    return text->startIndex();
348}
349
350/**
351 * Retrieve the index of the end of the input text.  This is the end index
352 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
353 * over which this <tt>Normalizer</tt> is iterating
354 */
355int32_t Normalizer::endIndex() const {
356    return text->endIndex();
357}
358
359//-------------------------------------------------------------------------
360// Property access methods
361//-------------------------------------------------------------------------
362
363void
364Normalizer::setMode(UNormalizationMode newMode)
365{
366    fUMode = newMode;
367    init();
368}
369
370UNormalizationMode
371Normalizer::getUMode() const
372{
373    return fUMode;
374}
375
376void
377Normalizer::setOption(int32_t option,
378                      UBool value)
379{
380    if (value) {
381        fOptions |= option;
382    } else {
383        fOptions &= (~option);
384    }
385    init();
386}
387
388UBool
389Normalizer::getOption(int32_t option) const
390{
391    return (fOptions & option) != 0;
392}
393
394/**
395 * Set the input text over which this <tt>Normalizer</tt> will iterate.
396 * The iteration position is set to the beginning of the input text.
397 */
398void
399Normalizer::setText(const UnicodeString& newText,
400                    UErrorCode &status)
401{
402    if (U_FAILURE(status)) {
403        return;
404    }
405    CharacterIterator *newIter = new StringCharacterIterator(newText);
406    if (newIter == NULL) {
407        status = U_MEMORY_ALLOCATION_ERROR;
408        return;
409    }
410    delete text;
411    text = newIter;
412    reset();
413}
414
415/**
416 * Set the input text over which this <tt>Normalizer</tt> will iterate.
417 * The iteration position is set to the beginning of the string.
418 */
419void
420Normalizer::setText(const CharacterIterator& newText,
421                    UErrorCode &status)
422{
423    if (U_FAILURE(status)) {
424        return;
425    }
426    CharacterIterator *newIter = newText.clone();
427    if (newIter == NULL) {
428        status = U_MEMORY_ALLOCATION_ERROR;
429        return;
430    }
431    delete text;
432    text = newIter;
433    reset();
434}
435
436void
437Normalizer::setText(const UChar* newText,
438                    int32_t length,
439                    UErrorCode &status)
440{
441    if (U_FAILURE(status)) {
442        return;
443    }
444    CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
445    if (newIter == NULL) {
446        status = U_MEMORY_ALLOCATION_ERROR;
447        return;
448    }
449    delete text;
450    text = newIter;
451    reset();
452}
453
454/**
455 * Copies the text under iteration into the UnicodeString referred to by "result".
456 * @param result Receives a copy of the text under iteration.
457 */
458void
459Normalizer::getText(UnicodeString&  result)
460{
461    text->getText(result);
462}
463
464//-------------------------------------------------------------------------
465// Private utility methods
466//-------------------------------------------------------------------------
467
468void Normalizer::clearBuffer() {
469    buffer.remove();
470    bufferPos=0;
471}
472
473UBool
474Normalizer::nextNormalize() {
475    clearBuffer();
476    currentIndex=nextIndex;
477    text->setIndex(nextIndex);
478    if(!text->hasNext()) {
479        return FALSE;
480    }
481    // Skip at least one character so we make progress.
482    UnicodeString segment(text->next32PostInc());
483    while(text->hasNext()) {
484        UChar32 c;
485        if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
486            text->move32(-1, CharacterIterator::kCurrent);
487            break;
488        }
489        segment.append(c);
490    }
491    nextIndex=text->getIndex();
492    UErrorCode errorCode=U_ZERO_ERROR;
493    fNorm2->normalize(segment, buffer, errorCode);
494    return U_SUCCESS(errorCode) && !buffer.isEmpty();
495}
496
497UBool
498Normalizer::previousNormalize() {
499    clearBuffer();
500    nextIndex=currentIndex;
501    text->setIndex(currentIndex);
502    if(!text->hasPrevious()) {
503        return FALSE;
504    }
505    UnicodeString segment;
506    while(text->hasPrevious()) {
507        UChar32 c=text->previous32();
508        segment.insert(0, c);
509        if(fNorm2->hasBoundaryBefore(c)) {
510            break;
511        }
512    }
513    currentIndex=text->getIndex();
514    UErrorCode errorCode=U_ZERO_ERROR;
515    fNorm2->normalize(segment, buffer, errorCode);
516    bufferPos=buffer.length();
517    return U_SUCCESS(errorCode) && !buffer.isEmpty();
518}
519
520U_NAMESPACE_END
521
522#endif /* #if !UCONFIG_NO_NORMALIZATION */
523