164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// Copyright (C) 2016 and later: Unicode, Inc. and others.
264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html
3b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/*
4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru**********************************************************************
583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius*   Copyright (C) 2001-2011, International Business Machines
6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru**********************************************************************
8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*   Date        Name        Description
9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*   07/03/01    aliu        Creation.
10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru**********************************************************************
11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru*/
12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h"
14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_TRANSLITERATION
16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
1750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/normalizer2.h"
1883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius#include "unicode/utf16.h"
1950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "cstring.h"
20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "nortrans.h"
21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_BEGIN
23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
2650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic inline Transliterator::Token cstrToken(const char *s) {
2750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return Transliterator::pointerToken((void *)s);
2850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
2950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * System registration hook.
32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid NormalizationTransliterator::registerIDs() {
3450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // In the Token, the byte after the NUL is the UNormalization2Mode.
35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
3650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfc\0\0"));
37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
3850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfkc\0\0"));
39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
4050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfc\0\1"));
41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
4250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfkc\0\1"));
4350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
4450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfc\0\2"));
4550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
4650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                     _create, cstrToken("nfc\0\3"));
47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
5150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
5250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                            UNICODE_STRING_SIMPLE("NFD"), FALSE);
5350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
5450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                            UNICODE_STRING_SIMPLE("FCD"), FALSE);
55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Factory methods
59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruTransliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                                                     Token context) {
6250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    const char *name = (const char *)context.pointer;
6350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
6450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UErrorCode errorCode = U_ZERO_ERROR;
6550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
6650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(U_SUCCESS(errorCode)) {
6750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return new NormalizationTransliterator(ID, *norm2);
6850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    } else {
6950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return NULL;
7050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Constructs a transliterator.
75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
7650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
7750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                                         const Normalizer2 &norm2) :
7850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator(id, 0), fNorm2(norm2) {}
79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Destructor.
82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruNormalizationTransliterator::~NormalizationTransliterator() {
84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Copy constructor.
88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruNormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
9050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Transliterator(o), fNorm2(o.fNorm2) {}
91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Transliterator API.
94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruTransliterator* NormalizationTransliterator::clone(void) const {
96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    return new NormalizationTransliterator(*this);
97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru/**
100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru * Implements {@link Transliterator#handleTransliterate}.
101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru */
102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru                                                      UBool isIncremental) const {
104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    // start and limit of the input range
105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t start = offsets.start;
106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    int32_t limit = offsets.limit;
107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    if(start >= limit) {
108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        return;
109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    }
110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    /*
112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * Normalize as short chunks at a time as possible even in
113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * bulk mode, so that styled text is minimally disrupted.
114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * In incremental mode, a chunk that ends with offsets.limit
115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * must not be normalized.
116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     *
117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * If it was known that the input text is not styled, then
118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     * a bulk mode normalization could look like this:
119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
12050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString input, normalized;
12150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t length = limit - start;
122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    input.releaseBuffer(length);
124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
12650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    fNorm2.normalize(input, normalized, status);
127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
12850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    text.handleReplaceBetween(start, limit, normalized);
129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
13050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t delta = normalized.length() - length;
131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    offsets.contextLimit += delta;
132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    offsets.limit += delta;
133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    offsets.start = limit + delta;
134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru     */
13650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UErrorCode errorCode = U_ZERO_ERROR;
13750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString segment;
13850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString normalized;
13950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UChar32 c = text.char32At(start);
14050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    do {
14150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        int32_t prev = start;
14250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // Skip at least one character so we make progress.
14350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // c holds the character at start.
14450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        segment.remove();
14550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        do {
14650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            segment.append(c);
14750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            start += U16_LENGTH(c);
14850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
14950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            // stop in incremental mode when we reach the input limit
151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            // in case there are additional characters that could change the
152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            // normalization result
15350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            start=prev;
15450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            break;
155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        }
15650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fNorm2.normalize(segment, normalized, errorCode);
15750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(U_FAILURE(errorCode)) {
15850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            break;
15950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
16050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(segment != normalized) {
161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            // replace the input chunk with its normalized form
16250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            text.handleReplaceBetween(prev, start, normalized);
163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru            // update all necessary indexes accordingly
16550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            int32_t delta = normalized.length() - (start - prev);
16650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            start += delta;
16750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            limit += delta;
168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru        }
16950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    } while(start < limit);
170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru    offsets.start = start;
17250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    offsets.contextLimit += limit - offsets.limit;
17350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    offsets.limit = limit;
174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}
175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_END
177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru
178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_TRANSLITERATION */
179