1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/*
2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru**********************************************************************
350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho*   Copyright (C) 2001-2010, International Business Machines
4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru**********************************************************************
6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*   Date        Name        Description
7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*   07/03/01    aliu        Creation.
8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru**********************************************************************
9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/
10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h"
12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_TRANSLITERATION
14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
1550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/normalizer2.h"
1650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "cstring.h"
17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "nortrans.h"
18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_BEGIN
20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
// ICU class-ID boilerplate for NormalizationTransliterator.
// NOTE(review): presumably expands to the getStaticClassID()/getDynamicClassID()
// definitions described in unicode/uobject.h -- confirm against that header.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
2350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic inline Transliterator::Token cstrToken(const char *s) {
2450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return Transliterator::pointerToken((void *)s);
2550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
2650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * System registration hook.
29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid NormalizationTransliterator::registerIDs() {
3150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // In the Token, the byte after the NUL is the UNormalization2Mode.
32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
3350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfc\0\0"));
34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
3550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfkc\0\0"));
36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
3750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfc\0\1"));
38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
3950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfkc\0\1"));
4050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
4150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfc\0\2"));
4250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
4350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfc\0\3"));
44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
4850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
4950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                            UNICODE_STRING_SIMPLE("NFD"), FALSE);
5050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
5150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                            UNICODE_STRING_SIMPLE("FCD"), FALSE);
52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Factory methods
56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruTransliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                                                     Token context) {
5950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    const char *name = (const char *)context.pointer;
6050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
6150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UErrorCode errorCode = U_ZERO_ERROR;
6250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
6350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(U_SUCCESS(errorCode)) {
6450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return new NormalizationTransliterator(ID, *norm2);
6550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    } else {
6650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return NULL;
6750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Constructs a transliterator.
72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
7350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
7450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                                         const Normalizer2 &norm2) :
7550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator(id, 0), fNorm2(norm2) {}
76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Destructor.
79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruNormalizationTransliterator::~NormalizationTransliterator() {
81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Copy constructor.
85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruNormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
8750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator(o), fNorm2(o.fNorm2) {}
88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Transliterator API.
91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruTransliterator* NormalizationTransliterator::clone(void) const {
93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    return new NormalizationTransliterator(*this);
94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Implements {@link Transliterator#handleTransliterate}.
98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                                                      UBool isIncremental) const {
101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    // start and limit of the input range
102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t start = offsets.start;
103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t limit = offsets.limit;
104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if(start >= limit) {
105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        return;
106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    /*
109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * Normalize as short chunks at a time as possible even in
110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * bulk mode, so that styled text is minimally disrupted.
111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * In incremental mode, a chunk that ends with offsets.limit
112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * must not be normalized.
113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     *
114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * If it was known that the input text is not styled, then
115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * a bulk mode normalization could look like this:
116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
11750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString input, normalized;
11850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t length = limit - start;
119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    input.releaseBuffer(length);
121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
12350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    fNorm2.normalize(input, normalized, status);
124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
12550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    text.handleReplaceBetween(start, limit, normalized);
126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
12750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t delta = normalized.length() - length;
128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    offsets.contextLimit += delta;
129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    offsets.limit += delta;
130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    offsets.start = limit + delta;
131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     */
13350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UErrorCode errorCode = U_ZERO_ERROR;
13450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString segment;
13550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString normalized;
13650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UChar32 c = text.char32At(start);
13750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    do {
13850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        int32_t prev = start;
13950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // Skip at least one character so we make progress.
14050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // c holds the character at start.
14150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        segment.remove();
14250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        do {
14350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            segment.append(c);
14450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            start += U16_LENGTH(c);
14550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
14650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            // stop in incremental mode when we reach the input limit
148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            // in case there are additional characters that could change the
149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            // normalization result
15050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            start=prev;
15150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            break;
152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        }
15350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fNorm2.normalize(segment, normalized, errorCode);
15450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(U_FAILURE(errorCode)) {
15550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            break;
15650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
15750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(segment != normalized) {
158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            // replace the input chunk with its normalized form
15950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            text.handleReplaceBetween(prev, start, normalized);
160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            // update all necessary indexes accordingly
16250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            int32_t delta = normalized.length() - (start - prev);
16350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            start += delta;
16450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            limit += delta;
165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        }
16650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    } while(start < limit);
167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    offsets.start = start;
16950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    offsets.contextLimit += limit - offsets.limit;
17050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    offsets.limit = limit;
171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_END
174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_TRANSLITERATION */
176