1/*
2 *************************************************************************
3 * COPYRIGHT:
4 * Copyright (c) 1996-2005, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 *************************************************************************
7 */
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_NORMALIZATION
12
13#include "unicode/unistr.h"
14#include "unicode/chariter.h"
15#include "unicode/schriter.h"
16#include "unicode/uchriter.h"
17#include "unicode/uiter.h"
18#include "unicode/normlzr.h"
19#include "cmemory.h"
20#include "unormimp.h"
21
22U_NAMESPACE_BEGIN
23
24UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
25
26//-------------------------------------------------------------------------
27// Constructors and other boilerplate
28//-------------------------------------------------------------------------
29
30Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
31    UObject(), fUMode(mode), fOptions(0),
32    currentIndex(0), nextIndex(0),
33    buffer(), bufferPos(0)
34{
35    init(new StringCharacterIterator(str));
36}
37
38Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
39    UObject(), fUMode(mode), fOptions(0),
40    currentIndex(0), nextIndex(0),
41    buffer(), bufferPos(0)
42{
43    init(new UCharCharacterIterator(str, length));
44}
45
46Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
47    UObject(), fUMode(mode), fOptions(0),
48    currentIndex(0), nextIndex(0),
49    buffer(), bufferPos(0)
50{
51    init(iter.clone());
52}
53
54Normalizer::Normalizer(const Normalizer &copy) :
55    UObject(copy), fUMode(copy.fUMode), fOptions(copy.fOptions),
56    currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
57    buffer(copy.buffer), bufferPos(copy.bufferPos)
58{
59    init(((CharacterIterator *)(copy.text->context))->clone());
60}
61
62static const UChar _NUL=0;
63
64void
65Normalizer::init(CharacterIterator *iter) {
66    UErrorCode errorCode=U_ZERO_ERROR;
67
68    text=(UCharIterator *)uprv_malloc(sizeof(UCharIterator));
69    if(text!=NULL) {
70        if(unorm_haveData(&errorCode)) {
71            uiter_setCharacterIterator(text, iter);
72        } else {
73            delete iter;
74            uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0));
75        }
76    } else {
77        delete iter;
78    }
79}
80
81Normalizer::~Normalizer()
82{
83    if(text!=NULL) {
84        delete (CharacterIterator *)text->context;
85        uprv_free(text);
86    }
87}
88
89Normalizer*
90Normalizer::clone() const
91{
92    if(this!=0) {
93        return new Normalizer(*this);
94    } else {
95        return 0;
96    }
97}
98
99/**
100 * Generates a hash code for this iterator.
101 */
102int32_t Normalizer::hashCode() const
103{
104    return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
105}
106
107UBool Normalizer::operator==(const Normalizer& that) const
108{
109    return
110        this==&that ||
111        fUMode==that.fUMode &&
112        fOptions==that.fOptions &&
113        *((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) &&
114        buffer==that.buffer &&
115        bufferPos==that.bufferPos &&
116        nextIndex==that.nextIndex;
117}
118
119//-------------------------------------------------------------------------
120// Static utility methods
121//-------------------------------------------------------------------------
122
123void U_EXPORT2
124Normalizer::normalize(const UnicodeString& source,
125                      UNormalizationMode mode, int32_t options,
126                      UnicodeString& result,
127                      UErrorCode &status) {
128    if(source.isBogus() || U_FAILURE(status)) {
129        result.setToBogus();
130        if(U_SUCCESS(status)) {
131            status=U_ILLEGAL_ARGUMENT_ERROR;
132        }
133    } else {
134        UnicodeString localDest;
135        UnicodeString *dest;
136
137        if(&source!=&result) {
138            dest=&result;
139        } else {
140            // the source and result strings are the same object, use a temporary one
141            dest=&localDest;
142        }
143
144        UChar *buffer=dest->getBuffer(source.length());
145        int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(),
146                                               source.getBuffer(), source.length(),
147                                               mode, options,
148                                               &status);
149        dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
150        if(status==U_BUFFER_OVERFLOW_ERROR) {
151            status=U_ZERO_ERROR;
152            buffer=dest->getBuffer(length);
153            length=unorm_internalNormalize(buffer, dest->getCapacity(),
154                                           source.getBuffer(), source.length(),
155                                           mode, options,
156                                           &status);
157            dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
158        }
159
160        if(dest==&localDest) {
161            result=*dest;
162        }
163        if(U_FAILURE(status)) {
164            result.setToBogus();
165        }
166    }
167}
168
169void U_EXPORT2
170Normalizer::compose(const UnicodeString& source,
171                    UBool compat, int32_t options,
172                    UnicodeString& result,
173                    UErrorCode &status) {
174    if(source.isBogus() || U_FAILURE(status)) {
175        result.setToBogus();
176        if(U_SUCCESS(status)) {
177            status=U_ILLEGAL_ARGUMENT_ERROR;
178        }
179    } else {
180        UnicodeString localDest;
181        UnicodeString *dest;
182
183        if(&source!=&result) {
184            dest=&result;
185        } else {
186            // the source and result strings are the same object, use a temporary one
187            dest=&localDest;
188        }
189
190        UChar *buffer=dest->getBuffer(source.length());
191        int32_t length=unorm_compose(buffer, dest->getCapacity(),
192                                     source.getBuffer(), source.length(),
193                                     compat, options,
194                                     &status);
195        dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
196        if(status==U_BUFFER_OVERFLOW_ERROR) {
197            status=U_ZERO_ERROR;
198            buffer=dest->getBuffer(length);
199            length=unorm_compose(buffer, dest->getCapacity(),
200                                 source.getBuffer(), source.length(),
201                                 compat, options,
202                                 &status);
203            dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
204        }
205
206        if(dest==&localDest) {
207            result=*dest;
208        }
209        if(U_FAILURE(status)) {
210            result.setToBogus();
211        }
212    }
213}
214
215void U_EXPORT2
216Normalizer::decompose(const UnicodeString& source,
217                      UBool compat, int32_t options,
218                      UnicodeString& result,
219                      UErrorCode &status) {
220    if(source.isBogus() || U_FAILURE(status)) {
221        result.setToBogus();
222        if(U_SUCCESS(status)) {
223            status=U_ILLEGAL_ARGUMENT_ERROR;
224        }
225    } else {
226        UnicodeString localDest;
227        UnicodeString *dest;
228
229        if(&source!=&result) {
230            dest=&result;
231        } else {
232            // the source and result strings are the same object, use a temporary one
233            dest=&localDest;
234        }
235
236        UChar *buffer=dest->getBuffer(source.length());
237        int32_t length=unorm_decompose(buffer, dest->getCapacity(),
238                                     source.getBuffer(), source.length(),
239                                     compat, options,
240                                     &status);
241        dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
242        if(status==U_BUFFER_OVERFLOW_ERROR) {
243            status=U_ZERO_ERROR;
244            buffer=dest->getBuffer(length);
245            length=unorm_decompose(buffer, dest->getCapacity(),
246                                   source.getBuffer(), source.length(),
247                                   compat, options,
248                                   &status);
249            dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
250        }
251
252        if(dest==&localDest) {
253            result=*dest;
254        }
255        if(U_FAILURE(status)) {
256            result.setToBogus();
257        }
258    }
259}
260
261UnicodeString & U_EXPORT2
262Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
263                        UnicodeString &result,
264                        UNormalizationMode mode, int32_t options,
265                        UErrorCode &errorCode) {
266    if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
267        result.setToBogus();
268        if(U_SUCCESS(errorCode)) {
269            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
270        }
271    } else {
272        UnicodeString localDest;
273        UnicodeString *dest;
274
275        if(&left!=&result && &right!=&result) {
276            dest=&result;
277        } else {
278            // the source and result strings are the same object, use a temporary one
279            dest=&localDest;
280        }
281
282        UChar *buffer=dest->getBuffer(left.length()+right.length());
283        int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
284                                         right.getBuffer(), right.length(),
285                                         buffer, dest->getCapacity(),
286                                         mode, options,
287                                         &errorCode);
288        dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
289        if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
290            errorCode=U_ZERO_ERROR;
291            buffer=dest->getBuffer(length);
292            int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
293                                             right.getBuffer(), right.length(),
294                                             buffer, dest->getCapacity(),
295                                             mode, options,
296                                             &errorCode);
297            dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
298        }
299
300        if(dest==&localDest) {
301            result=*dest;
302        }
303        if(U_FAILURE(errorCode)) {
304            result.setToBogus();
305        }
306    }
307    return result;
308}
309
310//-------------------------------------------------------------------------
311// Iteration API
312//-------------------------------------------------------------------------
313
314/**
315 * Return the current character in the normalized text.
316 */
317UChar32 Normalizer::current() {
318    if(bufferPos<buffer.length() || nextNormalize()) {
319        return buffer.char32At(bufferPos);
320    } else {
321        return DONE;
322    }
323}
324
325/**
326 * Return the next character in the normalized text and advance
327 * the iteration position by one.  If the end
328 * of the text has already been reached, {@link #DONE} is returned.
329 */
330UChar32 Normalizer::next() {
331    if(bufferPos<buffer.length() ||  nextNormalize()) {
332        UChar32 c=buffer.char32At(bufferPos);
333        bufferPos+=UTF_CHAR_LENGTH(c);
334        return c;
335    } else {
336        return DONE;
337    }
338}
339
340/**
341 * Return the previous character in the normalized text and decrement
342 * the iteration position by one.  If the beginning
343 * of the text has already been reached, {@link #DONE} is returned.
344 */
345UChar32 Normalizer::previous() {
346    if(bufferPos>0 || previousNormalize()) {
347        UChar32 c=buffer.char32At(bufferPos-1);
348        bufferPos-=UTF_CHAR_LENGTH(c);
349        return c;
350    } else {
351        return DONE;
352    }
353}
354
355void Normalizer::reset() {
356    currentIndex=nextIndex=text->move(text, 0, UITER_START);
357    clearBuffer();
358}
359
360void
361Normalizer::setIndexOnly(int32_t index) {
362    currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index
363    clearBuffer();
364}
365
366/**
367 * Return the first character in the normalized text->  This resets
368 * the <tt>Normalizer's</tt> position to the beginning of the text->
369 */
370UChar32 Normalizer::first() {
371    reset();
372    return next();
373}
374
375/**
376 * Return the last character in the normalized text->  This resets
377 * the <tt>Normalizer's</tt> position to be just before the
378 * the input text corresponding to that normalized character.
379 */
380UChar32 Normalizer::last() {
381    currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT);
382    clearBuffer();
383    return previous();
384}
385
386/**
387 * Retrieve the current iteration position in the input text that is
388 * being normalized.  This method is useful in applications such as
389 * searching, where you need to be able to determine the position in
390 * the input text that corresponds to a given normalized output character.
391 * <p>
392 * <b>Note:</b> This method sets the position in the <em>input</em>, while
393 * {@link #next} and {@link #previous} iterate through characters in the
394 * <em>output</em>.  This means that there is not necessarily a one-to-one
395 * correspondence between characters returned by <tt>next</tt> and
396 * <tt>previous</tt> and the indices passed to and returned from
397 * <tt>setIndex</tt> and {@link #getIndex}.
398 *
399 */
400int32_t Normalizer::getIndex() const {
401    if(bufferPos<buffer.length()) {
402        return currentIndex;
403    } else {
404        return nextIndex;
405    }
406}
407
408/**
409 * Retrieve the index of the start of the input text->  This is the begin index
410 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
411 * over which this <tt>Normalizer</tt> is iterating
412 */
413int32_t Normalizer::startIndex() const {
414    return text->getIndex(text, UITER_START);
415}
416
417/**
418 * Retrieve the index of the end of the input text->  This is the end index
419 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
420 * over which this <tt>Normalizer</tt> is iterating
421 */
422int32_t Normalizer::endIndex() const {
423    return text->getIndex(text, UITER_LIMIT);
424}
425
426//-------------------------------------------------------------------------
427// Property access methods
428//-------------------------------------------------------------------------
429
430void
431Normalizer::setMode(UNormalizationMode newMode)
432{
433    fUMode = newMode;
434}
435
436UNormalizationMode
437Normalizer::getUMode() const
438{
439    return fUMode;
440}
441
442void
443Normalizer::setOption(int32_t option,
444                      UBool value)
445{
446    if (value) {
447        fOptions |= option;
448    } else {
449        fOptions &= (~option);
450    }
451}
452
453UBool
454Normalizer::getOption(int32_t option) const
455{
456    return (fOptions & option) != 0;
457}
458
459/**
460 * Set the input text over which this <tt>Normalizer</tt> will iterate.
461 * The iteration position is set to the beginning of the input text->
462 */
463void
464Normalizer::setText(const UnicodeString& newText,
465                    UErrorCode &status)
466{
467    if (U_FAILURE(status)) {
468        return;
469    }
470    CharacterIterator *newIter = new StringCharacterIterator(newText);
471    if (newIter == NULL) {
472        status = U_MEMORY_ALLOCATION_ERROR;
473        return;
474    }
475    delete (CharacterIterator *)(text->context);
476    text->context = newIter;
477    reset();
478}
479
480/**
481 * Set the input text over which this <tt>Normalizer</tt> will iterate.
482 * The iteration position is set to the beginning of the string.
483 */
484void
485Normalizer::setText(const CharacterIterator& newText,
486                    UErrorCode &status)
487{
488    if (U_FAILURE(status)) {
489        return;
490    }
491    CharacterIterator *newIter = newText.clone();
492    if (newIter == NULL) {
493        status = U_MEMORY_ALLOCATION_ERROR;
494        return;
495    }
496    delete (CharacterIterator *)(text->context);
497    text->context = newIter;
498    reset();
499}
500
501void
502Normalizer::setText(const UChar* newText,
503                    int32_t length,
504                    UErrorCode &status)
505{
506    if (U_FAILURE(status)) {
507        return;
508    }
509    CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
510    if (newIter == NULL) {
511        status = U_MEMORY_ALLOCATION_ERROR;
512        return;
513    }
514    delete (CharacterIterator *)(text->context);
515    text->context = newIter;
516    reset();
517}
518
519/**
520 * Copies the text under iteration into the UnicodeString referred to by "result".
521 * @param result Receives a copy of the text under iteration.
522 */
523void
524Normalizer::getText(UnicodeString&  result)
525{
526    ((CharacterIterator *)(text->context))->getText(result);
527}
528
529//-------------------------------------------------------------------------
530// Private utility methods
531//-------------------------------------------------------------------------
532
533void Normalizer::clearBuffer() {
534    buffer.remove();
535    bufferPos=0;
536}
537
538UBool
539Normalizer::nextNormalize() {
540    UChar *p;
541    int32_t length;
542    UErrorCode errorCode;
543
544    clearBuffer();
545    currentIndex=nextIndex;
546    text->move(text, nextIndex, UITER_ZERO);
547    if(!text->hasNext(text)) {
548        return FALSE;
549    }
550
551    errorCode=U_ZERO_ERROR;
552    p=buffer.getBuffer(-1);
553    length=unorm_next(text, p, buffer.getCapacity(),
554                      fUMode, fOptions,
555                      TRUE, 0,
556                      &errorCode);
557    buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
558    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
559        errorCode=U_ZERO_ERROR;
560        text->move(text, nextIndex, UITER_ZERO);
561        p=buffer.getBuffer(length);
562        length=unorm_next(text, p, buffer.getCapacity(),
563                          fUMode, fOptions,
564                          TRUE, 0,
565                          &errorCode);
566        buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
567    }
568
569    nextIndex=text->getIndex(text, UITER_CURRENT);
570    return U_SUCCESS(errorCode) && !buffer.isEmpty();
571}
572
573UBool
574Normalizer::previousNormalize() {
575    UChar *p;
576    int32_t length;
577    UErrorCode errorCode;
578
579    clearBuffer();
580    nextIndex=currentIndex;
581    text->move(text, currentIndex, UITER_ZERO);
582    if(!text->hasPrevious(text)) {
583        return FALSE;
584    }
585
586    errorCode=U_ZERO_ERROR;
587    p=buffer.getBuffer(-1);
588    length=unorm_previous(text, p, buffer.getCapacity(),
589                          fUMode, fOptions,
590                          TRUE, 0,
591                          &errorCode);
592    buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
593    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
594        errorCode=U_ZERO_ERROR;
595        text->move(text, currentIndex, UITER_ZERO);
596        p=buffer.getBuffer(length);
597        length=unorm_previous(text, p, buffer.getCapacity(),
598                              fUMode, fOptions,
599                              TRUE, 0,
600                              &errorCode);
601        buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
602    }
603
604    bufferPos=buffer.length();
605    currentIndex=text->getIndex(text, UITER_CURRENT);
606    return U_SUCCESS(errorCode) && !buffer.isEmpty();
607}
608
609U_NAMESPACE_END
610
611#endif /* #if !UCONFIG_NO_NORMALIZATION */
612