nortrans.cpp revision 50294ead5e5d23f5bbfed76e00e6b510bd41eee1
1/*
2**********************************************************************
3*   Copyright (C) 2001-2010, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   07/03/01    aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/normalizer2.h"
16#include "cstring.h"
17#include "nortrans.h"
18
19U_NAMESPACE_BEGIN
20
21UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
22
23static inline Transliterator::Token cstrToken(const char *s) {
24    return Transliterator::pointerToken((void *)s);
25}
26
27/**
28 * System registration hook.
29 */
30void NormalizationTransliterator::registerIDs() {
31    // In the Token, the byte after the NUL is the UNormalization2Mode.
32    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
33                                     _create, cstrToken("nfc\0\0"));
34    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
35                                     _create, cstrToken("nfkc\0\0"));
36    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
37                                     _create, cstrToken("nfc\0\1"));
38    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
39                                     _create, cstrToken("nfkc\0\1"));
40    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
41                                     _create, cstrToken("nfc\0\2"));
42    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
43                                     _create, cstrToken("nfc\0\3"));
44    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
45                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
46    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
47                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
48    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
49                                            UNICODE_STRING_SIMPLE("NFD"), FALSE);
50    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
51                                            UNICODE_STRING_SIMPLE("FCD"), FALSE);
52}
53
54/**
55 * Factory methods
56 */
57Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
58                                                     Token context) {
59    const char *name = (const char *)context.pointer;
60    UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
61    UErrorCode errorCode = U_ZERO_ERROR;
62    const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
63    if(U_SUCCESS(errorCode)) {
64        return new NormalizationTransliterator(ID, *norm2);
65    } else {
66        return NULL;
67    }
68}
69
70/**
71 * Constructs a transliterator.
72 */
73NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
74                                                         const Normalizer2 &norm2) :
75    Transliterator(id, 0), fNorm2(norm2) {}
76
77/**
78 * Destructor.
79 */
80NormalizationTransliterator::~NormalizationTransliterator() {
81}
82
83/**
84 * Copy constructor.
85 */
86NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
87    Transliterator(o), fNorm2(o.fNorm2) {}
88
89/**
90 * Transliterator API.
91 */
92Transliterator* NormalizationTransliterator::clone(void) const {
93    return new NormalizationTransliterator(*this);
94}
95
96/**
97 * Implements {@link Transliterator#handleTransliterate}.
98 */
99void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
100                                                      UBool isIncremental) const {
101    // start and limit of the input range
102    int32_t start = offsets.start;
103    int32_t limit = offsets.limit;
104    if(start >= limit) {
105        return;
106    }
107
108    /*
109     * Normalize as short chunks at a time as possible even in
110     * bulk mode, so that styled text is minimally disrupted.
111     * In incremental mode, a chunk that ends with offsets.limit
112     * must not be normalized.
113     *
114     * If it was known that the input text is not styled, then
115     * a bulk mode normalization could look like this:
116
117    UnicodeString input, normalized;
118    int32_t length = limit - start;
119    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
120    input.releaseBuffer(length);
121
122    UErrorCode status = U_ZERO_ERROR;
123    fNorm2.normalize(input, normalized, status);
124
125    text.handleReplaceBetween(start, limit, normalized);
126
127    int32_t delta = normalized.length() - length;
128    offsets.contextLimit += delta;
129    offsets.limit += delta;
130    offsets.start = limit + delta;
131
132     */
133    UErrorCode errorCode = U_ZERO_ERROR;
134    UnicodeString segment;
135    UnicodeString normalized;
136    UChar32 c = text.char32At(start);
137    do {
138        int32_t prev = start;
139        // Skip at least one character so we make progress.
140        // c holds the character at start.
141        segment.remove();
142        do {
143            segment.append(c);
144            start += U16_LENGTH(c);
145        } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
146        if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
147            // stop in incremental mode when we reach the input limit
148            // in case there are additional characters that could change the
149            // normalization result
150            start=prev;
151            break;
152        }
153        fNorm2.normalize(segment, normalized, errorCode);
154        if(U_FAILURE(errorCode)) {
155            break;
156        }
157        if(segment != normalized) {
158            // replace the input chunk with its normalized form
159            text.handleReplaceBetween(prev, start, normalized);
160
161            // update all necessary indexes accordingly
162            int32_t delta = normalized.length() - (start - prev);
163            start += delta;
164            limit += delta;
165        }
166    } while(start < limit);
167
168    offsets.start = start;
169    offsets.contextLimit += limit - offsets.limit;
170    offsets.limit = limit;
171}
172
173U_NAMESPACE_END
174
175#endif /* #if !UCONFIG_NO_TRANSLITERATION */
176