nortrans.cpp revision 64339d36f8bd4db5025fe2988eda22b491a9219c
1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5*   Copyright (C) 2001-2011, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7**********************************************************************
8*   Date        Name        Description
9*   07/03/01    aliu        Creation.
10**********************************************************************
11*/
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION
16
17#include "unicode/normalizer2.h"
18#include "unicode/utf16.h"
19#include "cstring.h"
20#include "nortrans.h"
21
22U_NAMESPACE_BEGIN
23
// NOTE(review): presumably expands to the ICU class-ID (RTTI) member
// definitions for NormalizationTransliterator — confirm against the macro's
// definition in the ICU headers.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
25
26static inline Transliterator::Token cstrToken(const char *s) {
27    return Transliterator::pointerToken((void *)s);
28}
29
30/**
31 * System registration hook.
32 */
33void NormalizationTransliterator::registerIDs() {
34    // In the Token, the byte after the NUL is the UNormalization2Mode.
35    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
36                                     _create, cstrToken("nfc\0\0"));
37    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
38                                     _create, cstrToken("nfkc\0\0"));
39    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
40                                     _create, cstrToken("nfc\0\1"));
41    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
42                                     _create, cstrToken("nfkc\0\1"));
43    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
44                                     _create, cstrToken("nfc\0\2"));
45    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
46                                     _create, cstrToken("nfc\0\3"));
47    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
48                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
49    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
50                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
51    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
52                                            UNICODE_STRING_SIMPLE("NFD"), FALSE);
53    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
54                                            UNICODE_STRING_SIMPLE("FCD"), FALSE);
55}
56
57/**
58 * Factory methods
59 */
60Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
61                                                     Token context) {
62    const char *name = (const char *)context.pointer;
63    UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
64    UErrorCode errorCode = U_ZERO_ERROR;
65    const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
66    if(U_SUCCESS(errorCode)) {
67        return new NormalizationTransliterator(ID, *norm2);
68    } else {
69        return NULL;
70    }
71}
72
73/**
74 * Constructs a transliterator.
75 */
76NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
77                                                         const Normalizer2 &norm2) :
78    Transliterator(id, 0), fNorm2(norm2) {}
79
80/**
81 * Destructor.
82 */
83NormalizationTransliterator::~NormalizationTransliterator() {
84}
85
86/**
87 * Copy constructor.
88 */
89NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
90    Transliterator(o), fNorm2(o.fNorm2) {}
91
92/**
93 * Transliterator API.
94 */
95Transliterator* NormalizationTransliterator::clone(void) const {
96    return new NormalizationTransliterator(*this);
97}
98
99/**
100 * Implements {@link Transliterator#handleTransliterate}.
101 */
102void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
103                                                      UBool isIncremental) const {
104    // start and limit of the input range
105    int32_t start = offsets.start;
106    int32_t limit = offsets.limit;
107    if(start >= limit) {
108        return;
109    }
110
111    /*
112     * Normalize as short chunks at a time as possible even in
113     * bulk mode, so that styled text is minimally disrupted.
114     * In incremental mode, a chunk that ends with offsets.limit
115     * must not be normalized.
116     *
117     * If it was known that the input text is not styled, then
118     * a bulk mode normalization could look like this:
119
120    UnicodeString input, normalized;
121    int32_t length = limit - start;
122    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
123    input.releaseBuffer(length);
124
125    UErrorCode status = U_ZERO_ERROR;
126    fNorm2.normalize(input, normalized, status);
127
128    text.handleReplaceBetween(start, limit, normalized);
129
130    int32_t delta = normalized.length() - length;
131    offsets.contextLimit += delta;
132    offsets.limit += delta;
133    offsets.start = limit + delta;
134
135     */
136    UErrorCode errorCode = U_ZERO_ERROR;
137    UnicodeString segment;
138    UnicodeString normalized;
139    UChar32 c = text.char32At(start);
140    do {
141        int32_t prev = start;
142        // Skip at least one character so we make progress.
143        // c holds the character at start.
144        segment.remove();
145        do {
146            segment.append(c);
147            start += U16_LENGTH(c);
148        } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
149        if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
150            // stop in incremental mode when we reach the input limit
151            // in case there are additional characters that could change the
152            // normalization result
153            start=prev;
154            break;
155        }
156        fNorm2.normalize(segment, normalized, errorCode);
157        if(U_FAILURE(errorCode)) {
158            break;
159        }
160        if(segment != normalized) {
161            // replace the input chunk with its normalized form
162            text.handleReplaceBetween(prev, start, normalized);
163
164            // update all necessary indexes accordingly
165            int32_t delta = normalized.length() - (start - prev);
166            start += delta;
167            limit += delta;
168        }
169    } while(start < limit);
170
171    offsets.start = start;
172    offsets.contextLimit += limit - offsets.limit;
173    offsets.limit = limit;
174}
175
176U_NAMESPACE_END
177
178#endif /* #if !UCONFIG_NO_TRANSLITERATION */
179