normlzr.cpp revision c73f511526464f8e56c242df80552e9b0d94ae3d
1/*
2 *************************************************************************
3 * COPYRIGHT:
4 * Copyright (c) 1996-2012, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 *************************************************************************
7 */
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_NORMALIZATION
12
13#include "unicode/uniset.h"
14#include "unicode/unistr.h"
15#include "unicode/chariter.h"
16#include "unicode/schriter.h"
17#include "unicode/uchriter.h"
18#include "unicode/normlzr.h"
19#include "unicode/utf16.h"
20#include "cmemory.h"
21#include "normalizer2impl.h"
22#include "uprops.h"  // for uniset_getUnicode32Instance()
23
24U_NAMESPACE_BEGIN
25
26UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
27
28//-------------------------------------------------------------------------
29// Constructors and other boilerplate
30//-------------------------------------------------------------------------
31
32Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
33    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
34    text(new StringCharacterIterator(str)),
35    currentIndex(0), nextIndex(0),
36    buffer(), bufferPos(0)
37{
38    init();
39}
40
41Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
42    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
43    text(new UCharCharacterIterator(str, length)),
44    currentIndex(0), nextIndex(0),
45    buffer(), bufferPos(0)
46{
47    init();
48}
49
50Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
51    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
52    text(iter.clone()),
53    currentIndex(0), nextIndex(0),
54    buffer(), bufferPos(0)
55{
56    init();
57}
58
59Normalizer::Normalizer(const Normalizer &copy) :
60    UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
61    text(copy.text->clone()),
62    currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
63    buffer(copy.buffer), bufferPos(copy.bufferPos)
64{
65    init();
66}
67
68void
69Normalizer::init() {
70    UErrorCode errorCode=U_ZERO_ERROR;
71    fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
72    if(fOptions&UNORM_UNICODE_3_2) {
73        delete fFilteredNorm2;
74        fNorm2=fFilteredNorm2=
75            new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
76    }
77    if(U_FAILURE(errorCode)) {
78        errorCode=U_ZERO_ERROR;
79        fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
80    }
81}
82
83Normalizer::~Normalizer()
84{
85    delete fFilteredNorm2;
86    delete text;
87}
88
89Normalizer*
90Normalizer::clone() const
91{
92    return new Normalizer(*this);
93}
94
95/**
96 * Generates a hash code for this iterator.
97 */
98int32_t Normalizer::hashCode() const
99{
100    return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
101}
102
103UBool Normalizer::operator==(const Normalizer& that) const
104{
105    return
106        this==&that ||
107        (fUMode==that.fUMode &&
108        fOptions==that.fOptions &&
109        *text==*that.text &&
110        buffer==that.buffer &&
111        bufferPos==that.bufferPos &&
112        nextIndex==that.nextIndex);
113}
114
115//-------------------------------------------------------------------------
116// Static utility methods
117//-------------------------------------------------------------------------
118
119void U_EXPORT2
120Normalizer::normalize(const UnicodeString& source,
121                      UNormalizationMode mode, int32_t options,
122                      UnicodeString& result,
123                      UErrorCode &status) {
124    if(source.isBogus() || U_FAILURE(status)) {
125        result.setToBogus();
126        if(U_SUCCESS(status)) {
127            status=U_ILLEGAL_ARGUMENT_ERROR;
128        }
129    } else {
130        UnicodeString localDest;
131        UnicodeString *dest;
132
133        if(&source!=&result) {
134            dest=&result;
135        } else {
136            // the source and result strings are the same object, use a temporary one
137            dest=&localDest;
138        }
139        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
140        if(U_SUCCESS(status)) {
141            if(options&UNORM_UNICODE_3_2) {
142                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
143                    normalize(source, *dest, status);
144            } else {
145                n2->normalize(source, *dest, status);
146            }
147        }
148        if(dest==&localDest && U_SUCCESS(status)) {
149            result=*dest;
150        }
151    }
152}
153
154void U_EXPORT2
155Normalizer::compose(const UnicodeString& source,
156                    UBool compat, int32_t options,
157                    UnicodeString& result,
158                    UErrorCode &status) {
159    normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
160}
161
162void U_EXPORT2
163Normalizer::decompose(const UnicodeString& source,
164                      UBool compat, int32_t options,
165                      UnicodeString& result,
166                      UErrorCode &status) {
167    normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
168}
169
170UNormalizationCheckResult
171Normalizer::quickCheck(const UnicodeString& source,
172                       UNormalizationMode mode, int32_t options,
173                       UErrorCode &status) {
174    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
175    if(U_SUCCESS(status)) {
176        if(options&UNORM_UNICODE_3_2) {
177            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
178                quickCheck(source, status);
179        } else {
180            return n2->quickCheck(source, status);
181        }
182    } else {
183        return UNORM_MAYBE;
184    }
185}
186
187UBool
188Normalizer::isNormalized(const UnicodeString& source,
189                         UNormalizationMode mode, int32_t options,
190                         UErrorCode &status) {
191    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
192    if(U_SUCCESS(status)) {
193        if(options&UNORM_UNICODE_3_2) {
194            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
195                isNormalized(source, status);
196        } else {
197            return n2->isNormalized(source, status);
198        }
199    } else {
200        return FALSE;
201    }
202}
203
204UnicodeString & U_EXPORT2
205Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
206                        UnicodeString &result,
207                        UNormalizationMode mode, int32_t options,
208                        UErrorCode &errorCode) {
209    if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
210        result.setToBogus();
211        if(U_SUCCESS(errorCode)) {
212            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
213        }
214    } else {
215        UnicodeString localDest;
216        UnicodeString *dest;
217
218        if(&right!=&result) {
219            dest=&result;
220        } else {
221            // the right and result strings are the same object, use a temporary one
222            dest=&localDest;
223        }
224        *dest=left;
225        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
226        if(U_SUCCESS(errorCode)) {
227            if(options&UNORM_UNICODE_3_2) {
228                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
229                    append(*dest, right, errorCode);
230            } else {
231                n2->append(*dest, right, errorCode);
232            }
233        }
234        if(dest==&localDest && U_SUCCESS(errorCode)) {
235            result=*dest;
236        }
237    }
238    return result;
239}
240
241//-------------------------------------------------------------------------
242// Iteration API
243//-------------------------------------------------------------------------
244
245/**
246 * Return the current character in the normalized text.
247 */
248UChar32 Normalizer::current() {
249    if(bufferPos<buffer.length() || nextNormalize()) {
250        return buffer.char32At(bufferPos);
251    } else {
252        return DONE;
253    }
254}
255
256/**
257 * Return the next character in the normalized text and advance
258 * the iteration position by one.  If the end
259 * of the text has already been reached, {@link #DONE} is returned.
260 */
261UChar32 Normalizer::next() {
262    if(bufferPos<buffer.length() ||  nextNormalize()) {
263        UChar32 c=buffer.char32At(bufferPos);
264        bufferPos+=U16_LENGTH(c);
265        return c;
266    } else {
267        return DONE;
268    }
269}
270
271/**
272 * Return the previous character in the normalized text and decrement
273 * the iteration position by one.  If the beginning
274 * of the text has already been reached, {@link #DONE} is returned.
275 */
276UChar32 Normalizer::previous() {
277    if(bufferPos>0 || previousNormalize()) {
278        UChar32 c=buffer.char32At(bufferPos-1);
279        bufferPos-=U16_LENGTH(c);
280        return c;
281    } else {
282        return DONE;
283    }
284}
285
286void Normalizer::reset() {
287    currentIndex=nextIndex=text->setToStart();
288    clearBuffer();
289}
290
291void
292Normalizer::setIndexOnly(int32_t index) {
293    text->setIndex(index);  // pins index
294    currentIndex=nextIndex=text->getIndex();
295    clearBuffer();
296}
297
298/**
299 * Return the first character in the normalized text.  This resets
300 * the <tt>Normalizer's</tt> position to the beginning of the text.
301 */
302UChar32 Normalizer::first() {
303    reset();
304    return next();
305}
306
307/**
308 * Return the last character in the normalized text.  This resets
309 * the <tt>Normalizer's</tt> position to be just before the
310 * the input text corresponding to that normalized character.
311 */
312UChar32 Normalizer::last() {
313    currentIndex=nextIndex=text->setToEnd();
314    clearBuffer();
315    return previous();
316}
317
318/**
319 * Retrieve the current iteration position in the input text that is
320 * being normalized.  This method is useful in applications such as
321 * searching, where you need to be able to determine the position in
322 * the input text that corresponds to a given normalized output character.
323 * <p>
324 * <b>Note:</b> This method sets the position in the <em>input</em>, while
325 * {@link #next} and {@link #previous} iterate through characters in the
326 * <em>output</em>.  This means that there is not necessarily a one-to-one
327 * correspondence between characters returned by <tt>next</tt> and
328 * <tt>previous</tt> and the indices passed to and returned from
329 * <tt>setIndex</tt> and {@link #getIndex}.
330 *
331 */
332int32_t Normalizer::getIndex() const {
333    if(bufferPos<buffer.length()) {
334        return currentIndex;
335    } else {
336        return nextIndex;
337    }
338}
339
340/**
341 * Retrieve the index of the start of the input text.  This is the begin index
342 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
343 * over which this <tt>Normalizer</tt> is iterating
344 */
345int32_t Normalizer::startIndex() const {
346    return text->startIndex();
347}
348
349/**
350 * Retrieve the index of the end of the input text.  This is the end index
351 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
352 * over which this <tt>Normalizer</tt> is iterating
353 */
354int32_t Normalizer::endIndex() const {
355    return text->endIndex();
356}
357
358//-------------------------------------------------------------------------
359// Property access methods
360//-------------------------------------------------------------------------
361
362void
363Normalizer::setMode(UNormalizationMode newMode)
364{
365    fUMode = newMode;
366    init();
367}
368
369UNormalizationMode
370Normalizer::getUMode() const
371{
372    return fUMode;
373}
374
375void
376Normalizer::setOption(int32_t option,
377                      UBool value)
378{
379    if (value) {
380        fOptions |= option;
381    } else {
382        fOptions &= (~option);
383    }
384    init();
385}
386
387UBool
388Normalizer::getOption(int32_t option) const
389{
390    return (fOptions & option) != 0;
391}
392
393/**
394 * Set the input text over which this <tt>Normalizer</tt> will iterate.
395 * The iteration position is set to the beginning of the input text.
396 */
397void
398Normalizer::setText(const UnicodeString& newText,
399                    UErrorCode &status)
400{
401    if (U_FAILURE(status)) {
402        return;
403    }
404    CharacterIterator *newIter = new StringCharacterIterator(newText);
405    if (newIter == NULL) {
406        status = U_MEMORY_ALLOCATION_ERROR;
407        return;
408    }
409    delete text;
410    text = newIter;
411    reset();
412}
413
414/**
415 * Set the input text over which this <tt>Normalizer</tt> will iterate.
416 * The iteration position is set to the beginning of the string.
417 */
418void
419Normalizer::setText(const CharacterIterator& newText,
420                    UErrorCode &status)
421{
422    if (U_FAILURE(status)) {
423        return;
424    }
425    CharacterIterator *newIter = newText.clone();
426    if (newIter == NULL) {
427        status = U_MEMORY_ALLOCATION_ERROR;
428        return;
429    }
430    delete text;
431    text = newIter;
432    reset();
433}
434
435void
436Normalizer::setText(const UChar* newText,
437                    int32_t length,
438                    UErrorCode &status)
439{
440    if (U_FAILURE(status)) {
441        return;
442    }
443    CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
444    if (newIter == NULL) {
445        status = U_MEMORY_ALLOCATION_ERROR;
446        return;
447    }
448    delete text;
449    text = newIter;
450    reset();
451}
452
453/**
454 * Copies the text under iteration into the UnicodeString referred to by "result".
455 * @param result Receives a copy of the text under iteration.
456 */
457void
458Normalizer::getText(UnicodeString&  result)
459{
460    text->getText(result);
461}
462
463//-------------------------------------------------------------------------
464// Private utility methods
465//-------------------------------------------------------------------------
466
467void Normalizer::clearBuffer() {
468    buffer.remove();
469    bufferPos=0;
470}
471
472UBool
473Normalizer::nextNormalize() {
474    clearBuffer();
475    currentIndex=nextIndex;
476    text->setIndex(nextIndex);
477    if(!text->hasNext()) {
478        return FALSE;
479    }
480    // Skip at least one character so we make progress.
481    UnicodeString segment(text->next32PostInc());
482    while(text->hasNext()) {
483        UChar32 c;
484        if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
485            text->move32(-1, CharacterIterator::kCurrent);
486            break;
487        }
488        segment.append(c);
489    }
490    nextIndex=text->getIndex();
491    UErrorCode errorCode=U_ZERO_ERROR;
492    fNorm2->normalize(segment, buffer, errorCode);
493    return U_SUCCESS(errorCode) && !buffer.isEmpty();
494}
495
496UBool
497Normalizer::previousNormalize() {
498    clearBuffer();
499    nextIndex=currentIndex;
500    text->setIndex(currentIndex);
501    if(!text->hasPrevious()) {
502        return FALSE;
503    }
504    UnicodeString segment;
505    while(text->hasPrevious()) {
506        UChar32 c=text->previous32();
507        segment.insert(0, c);
508        if(fNorm2->hasBoundaryBefore(c)) {
509            break;
510        }
511    }
512    currentIndex=text->getIndex();
513    UErrorCode errorCode=U_ZERO_ERROR;
514    fNorm2->normalize(segment, buffer, errorCode);
515    bufferPos=buffer.length();
516    return U_SUCCESS(errorCode) && !buffer.isEmpty();
517}
518
519U_NAMESPACE_END
520
521#endif /* #if !UCONFIG_NO_NORMALIZATION */
522