16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/*
26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org**********************************************************************
36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   Copyright (C) 2001-2011, International Business Machines
46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   Corporation and others.  All Rights Reserved.
56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org**********************************************************************
66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   Date        Name        Description
76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*   07/03/01    aliu        Creation.
86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org**********************************************************************
96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/
106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h"
126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_TRANSLITERATION
146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/normalizer2.h"
166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utf16.h"
176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "cstring.h"
186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "nortrans.h"
196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN
216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic inline Transliterator::Token cstrToken(const char *s) {
256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return Transliterator::pointerToken((void *)s);
266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * System registration hook.
306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid NormalizationTransliterator::registerIDs() {
326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // In the Token, the byte after the NUL is the UNormalization2Mode.
336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                     _create, cstrToken("nfc\0\0"));
356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                     _create, cstrToken("nfkc\0\0"));
376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                     _create, cstrToken("nfc\0\1"));
396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                     _create, cstrToken("nfkc\0\1"));
416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                     _create, cstrToken("nfc\0\2"));
436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                     _create, cstrToken("nfc\0\3"));
456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                            UNICODE_STRING_SIMPLE("NFD"), FALSE);
516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                            UNICODE_STRING_SIMPLE("FCD"), FALSE);
536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Factory methods
576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgTransliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                                     Token context) {
606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *name = (const char *)context.pointer;
616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode errorCode = U_ZERO_ERROR;
636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if(U_SUCCESS(errorCode)) {
656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return new NormalizationTransliterator(ID, *norm2);
666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } else {
676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return NULL;
686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Constructs a transliterator.
736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgNormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                                         const Normalizer2 &norm2) :
766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator(id, 0), fNorm2(norm2) {}
776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Destructor.
806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgNormalizationTransliterator::~NormalizationTransliterator() {
826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Copy constructor.
866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgNormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    Transliterator(o), fNorm2(o.fNorm2) {}
896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Transliterator API.
926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgTransliterator* NormalizationTransliterator::clone(void) const {
946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    return new NormalizationTransliterator(*this);
956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Implements {@link Transliterator#handleTransliterate}.
996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                                                      UBool isIncremental) const {
1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // start and limit of the input range
1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t start = offsets.start;
1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t limit = offsets.limit;
1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    if(start >= limit) {
1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return;
1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /*
1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Normalize as short chunks at a time as possible even in
1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * bulk mode, so that styled text is minimally disrupted.
1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * In incremental mode, a chunk that ends with offsets.limit
1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * must not be normalized.
1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * If it was known that the input text is not styled, then
1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * a bulk mode normalization could look like this:
1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString input, normalized;
1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t length = limit - start;
1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    input.releaseBuffer(length);
1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode status = U_ZERO_ERROR;
1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    fNorm2.normalize(input, normalized, status);
1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    text.handleReplaceBetween(start, limit, normalized);
1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t delta = normalized.length() - length;
1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    offsets.contextLimit += delta;
1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    offsets.limit += delta;
1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    offsets.start = limit + delta;
1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UErrorCode errorCode = U_ZERO_ERROR;
1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString segment;
1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString normalized;
1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar32 c = text.char32At(start);
1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    do {
1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        int32_t prev = start;
1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // Skip at least one character so we make progress.
1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        // c holds the character at start.
1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        segment.remove();
1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        do {
1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            segment.append(c);
1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            start += U16_LENGTH(c);
1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // stop in incremental mode when we reach the input limit
1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // in case there are additional characters that could change the
1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // normalization result
1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            start=prev;
1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        fNorm2.normalize(segment, normalized, errorCode);
1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(U_FAILURE(errorCode)) {
1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            break;
1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        if(segment != normalized) {
1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // replace the input chunk with its normalized form
1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            text.handleReplaceBetween(prev, start, normalized);
1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            // update all necessary indexes accordingly
1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            int32_t delta = normalized.length() - (start - prev);
1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            start += delta;
1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org            limit += delta;
1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        }
1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    } while(start < limit);
1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    offsets.start = start;
1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    offsets.contextLimit += limit - offsets.limit;
1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    offsets.limit = limit;
1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}
1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END
1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif /* #if !UCONFIG_NO_TRANSLITERATION */
177