/*
*******************************************************************************
*
*   Copyright (C) 2009-2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  n2builder.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009nov25
*   created by: Markus W. Scherer
*
* Builds Normalizer2 data and writes a binary .nrm file.
* For the file format see source/common/normalizer2impl.h.
*/
1950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
2050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/utypes.h"
2127f654740f2a26ad62a5c155af9199af9e69b889claireho#include "n2builder.h"
2250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
2350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include <stdio.h>
2450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include <stdlib.h>
2550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include <string.h>
2650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if U_HAVE_STD_STRING
2750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include <vector>
2850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif
2950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/errorcode.h"
3050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/localpointer.h"
3150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/putil.h"
3250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/udata.h"
3350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/uniset.h"
3450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/unistr.h"
3550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/ustring.h"
36f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "charstr.h"
3750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "hash.h"
3850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "normalizer2impl.h"
3950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "toolutil.h"
4050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unewdata.h"
4150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "utrie2.h"
4227f654740f2a26ad62a5c155af9199af9e69b889claireho#include "uvectr32.h"
43f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "writesrc.h"
4450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
4550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_NORMALIZATION
4650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
4750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/* UDataInfo cf. udata.h */
4850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UDataInfo dataInfo={
4950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    sizeof(UDataInfo),
5050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    0,
5150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
5250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    U_IS_BIG_ENDIAN,
5350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    U_CHARSET_FAMILY,
5450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    U_SIZEOF_UCHAR,
5550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    0,
5650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
5750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
5883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    { 2, 0, 0, 0 },             /* formatVersion */
5950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    { 5, 2, 0, 0 }              /* dataVersion (Unicode version) */
6050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
6150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
6250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_BEGIN
6350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
6450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass HangulIterator {
6550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic:
6650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    struct Range {
6750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        UChar32 start, limit;
6850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        uint16_t norm16;
6950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    };
7050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
7150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    HangulIterator() : rangeIndex(0) {}
7250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    const Range *nextRange() {
73f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        if(rangeIndex<UPRV_LENGTHOF(ranges)) {
7450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            return ranges+rangeIndex++;
7550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        } else {
7650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            return NULL;
7750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
7850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
7950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    void reset() { rangeIndex=0; }
8050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoprivate:
8150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    static const Range ranges[4];
8250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t rangeIndex;
8350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
8450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
8550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst HangulIterator::Range HangulIterator::ranges[4]={
8650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
8750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
8850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // JAMO_T_BASE+1: not U+11A7
8950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
9050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
9150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
9250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
9350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostruct CompositionPair {
9450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
9550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UChar32 trail, composite;
9650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
9750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
9850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostruct Norm {
9950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
10050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
10150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UBool hasMapping() const { return mappingType>REMOVED; }
10250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
10350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // Requires hasMapping() and well-formed mapping.
10450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    void setMappingCP() {
10550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        UChar32 c;
10650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
10750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            mappingCP=c;
10850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        } else {
10950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            mappingCP=U_SENTINEL;
11050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
11150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
11250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
11327f654740f2a26ad62a5c155af9199af9e69b889claireho    const CompositionPair *getCompositionPairs(int32_t &length) const {
11427f654740f2a26ad62a5c155af9199af9e69b889claireho        if(compositions==NULL) {
11527f654740f2a26ad62a5c155af9199af9e69b889claireho            length=0;
11627f654740f2a26ad62a5c155af9199af9e69b889claireho            return NULL;
11727f654740f2a26ad62a5c155af9199af9e69b889claireho        } else {
11827f654740f2a26ad62a5c155af9199af9e69b889claireho            length=compositions->size()/2;
11927f654740f2a26ad62a5c155af9199af9e69b889claireho            return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
12027f654740f2a26ad62a5c155af9199af9e69b889claireho        }
12127f654740f2a26ad62a5c155af9199af9e69b889claireho    }
12227f654740f2a26ad62a5c155af9199af9e69b889claireho
12350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString *mapping;
12483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    UnicodeString *rawMapping;  // non-NULL if the mapping is further decomposed
12550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UChar32 mappingCP;  // >=0 if mapping to 1 code point
12650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t mappingPhase;
12750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    MappingType mappingType;
12850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
12927f654740f2a26ad62a5c155af9199af9e69b889claireho    UVector32 *compositions;  // (trail, composite) pairs
13050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    uint8_t cc;
13150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UBool combinesBack;
13250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UBool hasNoCompBoundaryAfter;
13350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
13450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    enum OffsetType {
13583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        OFFSET_NONE,
13683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // Composition for back-combining character. Allowed, but not normally used.
13783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        OFFSET_MAYBE_YES,
13883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // Composition for a starter that does not have a decomposition mapping.
13983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        OFFSET_YES_YES,
14083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // Round-trip mapping & composition for a starter.
14183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
14283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // Round-trip mapping for a starter that itself does not combine-forward.
14383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        OFFSET_YES_NO_MAPPING_ONLY,
14483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // One-way mapping.
14583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        OFFSET_NO_NO,
14683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // Delta for an algorithmic one-way mapping.
14750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        OFFSET_DELTA
14850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    };
14950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
15050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t offset;
15150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
15250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
15350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass Normalizer2DBEnumerator {
15450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic:
15550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}
15650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    virtual ~Normalizer2DBEnumerator() {}
15750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;
15850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Normalizer2DBEnumerator *ptr() { return this; }
15950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoprotected:
16050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Normalizer2DataBuilder &builder;
16150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
16250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
16350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CDECL_BEGIN
16450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
16550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UBool U_CALLCONV
16650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoenumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
16750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);
16850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
16950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
17050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CDECL_END
17150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
17250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
173f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL),
174f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        norm16TrieLength(0) {
17550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    memset(unicodeVersion, 0, sizeof(unicodeVersion));
17650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    normTrie=utrie2_open(0, 0, &errorCode);
17750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
17850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    norms=allocNorm();  // unused Norm struct at index 0
17950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    memset(indexes, 0, sizeof(indexes));
18083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    memset(smallFCD, 0, sizeof(smallFCD));
18150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
18250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
18350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::~Normalizer2DataBuilder() {
18450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    utrie2_close(normTrie);
18550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t normsLength=utm_countItems(normMem);
18650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    for(int32_t i=1; i<normsLength; ++i) {
18750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        delete norms[i].mapping;
18883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        delete norms[i].rawMapping;
18950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        delete norms[i].compositions;
19050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
19150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    utm_close(normMem);
19250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    utrie2_close(norm16Trie);
19350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
19450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
19550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid
19650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::setUnicodeVersion(const char *v) {
19783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    UVersionInfo nullVersion={ 0, 0, 0, 0 };
19883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    UVersionInfo version;
19983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    u_versionFromString(version, v);
20083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
20183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
20283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    ) {
20383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        char buffer[U_MAX_VERSION_STRING_LENGTH];
20483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        u_versionToString(unicodeVersion, buffer);
20583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
20683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                buffer, v);
20783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        exit(U_ILLEGAL_ARGUMENT_ERROR);
20883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    }
20983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
21050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
21150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
21250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNorm *Normalizer2DataBuilder::allocNorm() {
21350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Norm *p=(Norm *)utm_alloc(normMem);
21450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
21550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return p;
21650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
21750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
21850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/* get an existing Norm unit */
21950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNorm *Normalizer2DataBuilder::getNorm(UChar32 c) {
22050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    uint32_t i=utrie2_get32(normTrie, c);
22150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(i==0) {
22250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return NULL;
22350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
22450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return norms+i;
22550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
22650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
22750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
22850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return norms[utrie2_get32(normTrie, c)];
22950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
23050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
23150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/*
23250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * get or create a Norm unit;
23350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * get or create the intermediate trie entries for it as well
23450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */
23550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNorm *Normalizer2DataBuilder::createNorm(UChar32 c) {
23650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    uint32_t i=utrie2_get32(normTrie, c);
23750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(i!=0) {
23850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return norms+i;
23950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    } else {
24050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        /* allocate Norm */
24150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        Norm *p=allocNorm();
24250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        IcuToolErrorCode errorCode("gennorm2/createNorm()");
24350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
24450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return p;
24550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
24650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
24750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
24850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNorm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
24950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(p!=NULL) {
25050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(p->mappingType!=Norm::NONE) {
25150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if( overrideHandling==OVERRIDE_NONE ||
25250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
25350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            ) {
25450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                fprintf(stderr,
25550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        "error in gennorm2 phase %d: "
25650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        "not permitted to override mapping for U+%04lX from phase %d\n",
25750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        (int)phase, (long)c, (int)p->mappingPhase);
25850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                exit(U_INVALID_FORMAT_ERROR);
25950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
26050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            delete p->mapping;
26150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            p->mapping=NULL;
26250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
26350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        p->mappingPhase=phase;
26450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
26550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return p;
26650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
26750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
26850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
26950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    overrideHandling=oh;
27050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    ++phase;
27150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
27250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
27350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
27450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    createNorm(c)->cc=cc;
27550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
27650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
27750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehouint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {
27850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return getNormRef(c).cc;
27950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
28050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
28150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UBool isWellFormed(const UnicodeString &s) {
28250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UErrorCode errorCode=U_ZERO_ERROR;
28350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode);
28450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
28550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
28650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
28750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
28850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(!isWellFormed(m)) {
28950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr,
29050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "error in gennorm2 phase %d: "
29150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "illegal one-way mapping from U+%04lX to malformed string\n",
29250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                (int)phase, (long)c);
29350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(U_INVALID_FORMAT_ERROR);
29450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
29550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Norm *p=checkNormForMapping(createNorm(c), c);
29650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    p->mapping=new UnicodeString(m);
29750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    p->mappingType=Norm::ONE_WAY;
29850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    p->setMappingCP();
29950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
30050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
30150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
30250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(U_IS_SURROGATE(c)) {
30350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr,
30450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "error in gennorm2 phase %d: "
30550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "illegal round-trip mapping from surrogate code point U+%04lX\n",
30650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                (int)phase, (long)c);
30750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(U_INVALID_FORMAT_ERROR);
30850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
30950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(!isWellFormed(m)) {
31050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr,
31150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "error in gennorm2 phase %d: "
31250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "illegal round-trip mapping from U+%04lX to malformed string\n",
31350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                (int)phase, (long)c);
31450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(U_INVALID_FORMAT_ERROR);
31550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
31650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t numCP=u_countChar32(m.getBuffer(), m.length());
31750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(numCP!=2) {
31850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr,
31950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "error in gennorm2 phase %d: "
32050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
32150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                (int)phase, (long)c, (int)numCP);
32250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(U_INVALID_FORMAT_ERROR);
32350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
32450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Norm *p=checkNormForMapping(createNorm(c), c);
32550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    p->mapping=new UnicodeString(m);
32650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    p->mappingType=Norm::ROUND_TRIP;
32750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    p->mappingCP=U_SENTINEL;
32850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
32950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
33050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::removeMapping(UChar32 c) {
33150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Norm *p=checkNormForMapping(getNorm(c), c);
33250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(p!=NULL) {
33350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        p->mappingType=Norm::REMOVED;
33450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
33550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
33650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
33750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass CompositionBuilder : public Normalizer2DBEnumerator {
33850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic:
33950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
34050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
34150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        builder.addComposition(start, end, value);
34250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return TRUE;
34350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
34450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
34550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
34650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid
34750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
34850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(norms[value].mappingType==Norm::ROUND_TRIP) {
34950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(start!=end) {
35050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            fprintf(stderr,
35150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "gennorm2 error: same round-trip mapping for "
35250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "more than 1 code point U+%04lX..U+%04lX\n",
35350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    (long)start, (long)end);
35450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            exit(U_INVALID_FORMAT_ERROR);
35550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
35650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(norms[value].cc!=0) {
35750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            fprintf(stderr,
35850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "gennorm2 error: "
35950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "U+%04lX has a round-trip mapping and ccc!=0, "
36050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "not possible in Unicode normalization\n",
36150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    (long)start);
36250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            exit(U_INVALID_FORMAT_ERROR);
36350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
36450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // setRoundTripMapping() ensured that there are exactly two code points.
36550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        const UnicodeString &m=*norms[value].mapping;
36650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        UChar32 lead=m.char32At(0);
36750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        UChar32 trail=m.char32At(m.length()-1);
36850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(getCC(lead)!=0) {
36950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            fprintf(stderr,
37050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "gennorm2 error: "
37150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
37250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "not possible in Unicode normalization\n",
37350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    (long)start, (long)lead);
37450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            exit(U_INVALID_FORMAT_ERROR);
37550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
37650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // Flag for trailing character.
37750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        createNorm(trail)->combinesBack=TRUE;
37850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // Insert (trail, composite) pair into compositions list for the lead character.
37927f654740f2a26ad62a5c155af9199af9e69b889claireho        IcuToolErrorCode errorCode("gennorm2/addComposition()");
38050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        Norm *leadNorm=createNorm(lead);
38127f654740f2a26ad62a5c155af9199af9e69b889claireho        UVector32 *compositions=leadNorm->compositions;
38227f654740f2a26ad62a5c155af9199af9e69b889claireho        int32_t i;
38350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(compositions==NULL) {
38427f654740f2a26ad62a5c155af9199af9e69b889claireho            compositions=leadNorm->compositions=new UVector32(errorCode);
38527f654740f2a26ad62a5c155af9199af9e69b889claireho            i=0;  // "insert" the first pair at index 0
38650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        } else {
38750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            // Insertion sort, and check for duplicate trail characters.
38827f654740f2a26ad62a5c155af9199af9e69b889claireho            int32_t length;
38927f654740f2a26ad62a5c155af9199af9e69b889claireho            const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
39027f654740f2a26ad62a5c155af9199af9e69b889claireho            for(i=0; i<length; ++i) {
39127f654740f2a26ad62a5c155af9199af9e69b889claireho                if(trail==pairs[i].trail) {
39250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    fprintf(stderr,
39350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            "gennorm2 error: same round-trip mapping for "
39450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
39550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            (long)start, (long)lead, (long)trail);
39650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    exit(U_INVALID_FORMAT_ERROR);
39750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                }
39827f654740f2a26ad62a5c155af9199af9e69b889claireho                if(trail<pairs[i].trail) {
39950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    break;
40050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                }
40150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
40250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
40327f654740f2a26ad62a5c155af9199af9e69b889claireho        compositions->insertElementAt(trail, 2*i, errorCode);
40427f654740f2a26ad62a5c155af9199af9e69b889claireho        compositions->insertElementAt(start, 2*i+1, errorCode);
40550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
40650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
40750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
40850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,
40950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                                    uint8_t lowCC, uint8_t highCC) const {
41027f654740f2a26ad62a5c155af9199af9e69b889claireho    if((highCC-lowCC)>=2) {
41127f654740f2a26ad62a5c155af9199af9e69b889claireho        int32_t length;
41227f654740f2a26ad62a5c155af9199af9e69b889claireho        const CompositionPair *pairs=norm.getCompositionPairs(length);
41327f654740f2a26ad62a5c155af9199af9e69b889claireho        for(int32_t i=0; i<length; ++i) {
41427f654740f2a26ad62a5c155af9199af9e69b889claireho            uint8_t trailCC=getCC(pairs[i].trail);
41550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(lowCC<trailCC && trailCC<highCC) {
41650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                return TRUE;
41750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
41850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
41950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
42050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return FALSE;
42150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
42250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
42350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {
42427f654740f2a26ad62a5c155af9199af9e69b889claireho    int32_t length;
42527f654740f2a26ad62a5c155af9199af9e69b889claireho    const CompositionPair *pairs=norm.getCompositionPairs(length);
42627f654740f2a26ad62a5c155af9199af9e69b889claireho    for(int32_t i=0; i<length; ++i) {
42727f654740f2a26ad62a5c155af9199af9e69b889claireho        if(trail==pairs[i].trail) {
42827f654740f2a26ad62a5c155af9199af9e69b889claireho            return pairs[i].composite;
42927f654740f2a26ad62a5c155af9199af9e69b889claireho        }
43027f654740f2a26ad62a5c155af9199af9e69b889claireho        if(trail<pairs[i].trail) {
43127f654740f2a26ad62a5c155af9199af9e69b889claireho            break;
43250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
43350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
43450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return U_SENTINEL;
43550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
43650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
43750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass Decomposer : public Normalizer2DBEnumerator {
43850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic:
43950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
44050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
44150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        didDecompose|=builder.decompose(start, end, value);
44250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return TRUE;
44350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
44450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UBool didDecompose;
44550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
44650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
44750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool
44850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
44950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(norms[value].hasMapping()) {
45083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        Norm &norm=norms[value];
45183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        const UnicodeString &m=*norm.mapping;
45250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        UnicodeString *decomposed=NULL;
45350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        const UChar *s=m.getBuffer();
45450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        int32_t length=m.length();
45550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        int32_t prev, i=0;
45650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        UChar32 c;
45750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        while(i<length) {
45850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            prev=i;
45950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            U16_NEXT(s, i, length, c);
46050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(start<=c && c<=end) {
46150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                fprintf(stderr,
46250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
46350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        (long)c);
46450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                exit(U_INVALID_FORMAT_ERROR);
46550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
46650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            const Norm &cNorm=getNormRef(c);
46750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(cNorm.hasMapping()) {
46883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                if(norm.mappingType==Norm::ROUND_TRIP) {
46950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    if(prev==0) {
47050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        if(cNorm.mappingType!=Norm::ROUND_TRIP) {
47150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            fprintf(stderr,
47250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    "gennorm2 error: "
47350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    "U+%04lX's round-trip mapping's starter "
47450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    "U+%04lX one-way-decomposes, "
47550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    "not possible in Unicode normalization\n",
47650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    (long)start, (long)c);
47750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            exit(U_INVALID_FORMAT_ERROR);
47850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        }
47950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        uint8_t myTrailCC=getCC(m.char32At(i));
48050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
48150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        uint8_t cTrailCC=getCC(cTrailChar);
48250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        if(cTrailCC>myTrailCC) {
48350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            fprintf(stderr,
48450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    "gennorm2 error: "
48550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    "U+%04lX's round-trip mapping's starter "
48650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    "U+%04lX decomposes and the "
48750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    "inner/earlier tccc=%hu > outer/following tccc=%hu, "
48850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    "not possible in Unicode normalization\n",
48950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    (long)start, (long)c,
49050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                    (short)cTrailCC, (short)myTrailCC);
49150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            exit(U_INVALID_FORMAT_ERROR);
49250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        }
49350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    } else {
49450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        fprintf(stderr,
49550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                "gennorm2 error: "
49650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                "U+%04lX's round-trip mapping's non-starter "
49750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                "U+%04lX decomposes, "
49850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                "not possible in Unicode normalization\n",
49950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                                (long)start, (long)c);
50050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        exit(U_INVALID_FORMAT_ERROR);
50150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    }
50250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                }
50350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                if(decomposed==NULL) {
50450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    decomposed=new UnicodeString(m, 0, prev);
50550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                }
50650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                decomposed->append(*cNorm.mapping);
50750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            } else if(Hangul::isHangul(c)) {
50850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                UChar buffer[3];
50950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                int32_t hangulLength=Hangul::decompose(c, buffer);
51083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
51150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    fprintf(stderr,
51250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            "gennorm2 error: "
51350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            "U+%04lX's round-trip mapping's non-starter "
51450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            "U+%04lX decomposes, "
51550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            "not possible in Unicode normalization\n",
51650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                            (long)start, (long)c);
51750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    exit(U_INVALID_FORMAT_ERROR);
51850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                }
51950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                if(decomposed==NULL) {
52050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    decomposed=new UnicodeString(m, 0, prev);
52150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                }
52250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                decomposed->append(buffer, hangulLength);
52350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            } else if(decomposed!=NULL) {
52450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                decomposed->append(m, prev, i-prev);
52550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
52650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
52750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(decomposed!=NULL) {
52883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            if(norm.rawMapping==NULL) {
52983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                // Remember the original mapping when decomposing recursively.
53083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                norm.rawMapping=norm.mapping;
53183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            } else {
53283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                delete norm.mapping;
53383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            }
53483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            norm.mapping=decomposed;
53583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // Not  norm.setMappingCP();  because the original mapping
53650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            // is most likely to be encodable as a delta.
53750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            return TRUE;
53850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
53950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
54050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return FALSE;
54150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
54250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
54350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass BuilderReorderingBuffer {
54450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic:
54550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
54650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    void reset() {
54750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fLength=0;
54850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fLastStarterIndex=-1;
54950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fDidReorder=FALSE;
55050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
55150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t length() const { return fLength; }
55250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UBool isEmpty() const { return fLength==0; }
55350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t lastStarterIndex() const { return fLastStarterIndex; }
55450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
55550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
55650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UBool didReorder() const { return fDidReorder; }
55750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    void append(UChar32 c, uint8_t cc) {
55850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
55950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(cc==0) {
56050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                fLastStarterIndex=fLength;
56150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
56250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            fArray[fLength++]=(c<<8)|cc;
56350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            return;
56450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
56550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // Let this character bubble back to its canonical order.
56650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        int32_t i=fLength-1;
56750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        while(i>fLastStarterIndex && ccAt(i)>cc) {
56850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            --i;
56950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
57050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        ++i;  // after the last starter or prevCC<=cc
57150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // Move this and the following characters forward one to make space.
57250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        for(int32_t j=fLength; i<j; --j) {
57350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            fArray[j]=fArray[j-1];
57450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
57550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fArray[i]=(c<<8)|cc;
57650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        ++fLength;
57750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fDidReorder=TRUE;
57850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
57950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    void toString(UnicodeString &dest) {
58050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        dest.remove();
58150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        for(int32_t i=0; i<fLength; ++i) {
58250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            dest.append(charAt(i));
58350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
58450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
58550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    void setComposite(UChar32 composite, int32_t combMarkIndex) {
58650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fArray[fLastStarterIndex]=composite<<8;
58750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // Remove the combining mark that contributed to the composite.
58850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        --fLength;
58950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        while(combMarkIndex<fLength) {
59050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            fArray[combMarkIndex]=fArray[combMarkIndex+1];
59150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            ++combMarkIndex;
59250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
59350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
59450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoprivate:
59550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
59650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t fLength;
59750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t fLastStarterIndex;
59850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UBool fDidReorder;
59950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
60050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
60150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid
60250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
60350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString &m=*p->mapping;
60450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t length=m.length();
60550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
60650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return;  // writeMapping() will complain about it and print the code point.
60750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
60850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    const UChar *s=m.getBuffer();
60950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t i=0;
61050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UChar32 c;
61150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    while(i<length) {
61250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        U16_NEXT(s, i, length, c);
61350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        buffer.append(c, getCC(c));
61450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
61550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(buffer.didReorder()) {
61650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        buffer.toString(m);
61750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
61850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
61950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
62083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius/*
62183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
62283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * A starter character with a mapping does not have a composition boundary after it
62383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * if the character itself combines-forward (which is tested by the caller of this function),
62483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * or it is deleted (mapped to the empty string),
62583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * or its mapping contains no starter,
62683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * or the last starter combines-forward.
62783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius */
62850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
62950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(buffer.isEmpty()) {
63083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        return TRUE;  // maps-to-empty-string is no boundary of any kind
63150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
63250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t lastStarterIndex=buffer.lastStarterIndex();
63350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(lastStarterIndex<0) {
63450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return TRUE;  // no starter
63550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
63650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UChar32 starter=buffer.charAt(lastStarterIndex);
63750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if( Hangul::isJamoL(starter) ||
63850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        (Hangul::isJamoV(starter) &&
63950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho         0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))
64050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    ) {
64150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
64250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // otherwise it is blocked.
64350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return lastStarterIndex==buffer.length()-1;
64450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
64583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    // Note: There can be no Hangul syllable in the fully decomposed mapping.
64650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    const Norm *starterNorm=&getNormRef(starter);
64750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(starterNorm->compositions==NULL) {
64850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return FALSE;  // the last starter does not combine forward
64950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
65050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // Compose as far as possible, and see if further compositions are possible.
65150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    uint8_t prevCC=0;
65250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
65350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
65450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {
65550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            return TRUE;
65650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
65750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if( prevCC<cc &&
65850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0
65950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        ) {
66050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            buffer.setComposite(starter, combMarkIndex);
66150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            starterNorm=&getNormRef(starter);
66250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(starterNorm->compositions==NULL) {
66350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                return FALSE;  // the composite does not combine further
66450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
66550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        } else {
66650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            prevCC=cc;
66750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            ++combMarkIndex;
66850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
66950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
67050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // TRUE if the final, forward-combining starter is at the end.
67150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return prevCC==0;
67250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
67350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
67450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Requires p->hasMapping().
67583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius// Returns the offset of the "first unit" from the beginning of the extraData for c.
67683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
67783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Corneliusint32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
67850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString &m=*p->mapping;
67950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t length=m.length();
68050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
68150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr,
68250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "gennorm2 error: "
68350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "mapping for U+%04lX longer than maximum of %d\n",
68450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
68550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(U_INVALID_FORMAT_ERROR);
68650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
68750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t leadCC, trailCC;
68850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(length==0) {
68950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        leadCC=trailCC=0;
69050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    } else {
69150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        leadCC=getCC(m.char32At(0));
69250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        trailCC=getCC(m.char32At(length-1));
69350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
69450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
69550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr,
69650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "gennorm2 error: "
69750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
69850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                (long)c);
69950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(U_INVALID_FORMAT_ERROR);
70050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
70183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    // Write small-FCD data.
70283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    if((leadCC|trailCC)!=0) {
70383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
70483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
70583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    }
70683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    // Write the mapping & raw mapping extraData.
70750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t firstUnit=length|(trailCC<<8);
70883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    int32_t preMappingLength=0;
70983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    if(p->rawMapping!=NULL) {
71083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        UnicodeString &rm=*p->rawMapping;
71183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        int32_t rmLength=rm.length();
71283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
71383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            fprintf(stderr,
71483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                    "gennorm2 error: "
71583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                    "raw mapping for U+%04lX longer than maximum of %d\n",
71683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                    (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
71783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            exit(U_INVALID_FORMAT_ERROR);
71883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        }
71983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        UChar rm0=rm.charAt(0);
72083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        if( rmLength==length-1 &&
72183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // 99: overlong substring lengths get pinned to remainder lengths anyway
72283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            0==rm.compare(1, 99, m, 2, 99) &&
72383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
72483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        ) {
72583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // Compression:
72683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // rawMapping=rm0+mapping.substring(2) -> store only rm0
72783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            //
72883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // The raw mapping is the same as the final mapping after replacing
72983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // the final mapping's first two code units with the raw mapping's first one.
73083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // In this case, we store only that first unit, rm0.
73183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // This helps with a few hundred mappings.
73283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            dataString.append(rm0);
73383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            preMappingLength=1;
73483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        } else {
73583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // Store the raw mapping with its length.
73683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            dataString.append(rm);
73783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            dataString.append((UChar)rmLength);
73883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            preMappingLength=rmLength+1;
73983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        }
74083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
74150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
74283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    int32_t cccLccc=p->cc|(leadCC<<8);
74383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    if(cccLccc!=0) {
74483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        dataString.append((UChar)cccLccc);
74583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        ++preMappingLength;
74683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
74750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
74850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(p->hasNoCompBoundaryAfter) {
74950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
75050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
75150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    dataString.append((UChar)firstUnit);
75250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    dataString.append(m);
75383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    return preMappingLength;
75450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
75550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
75650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Requires p->compositions!=NULL.
75750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
75850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(p->cc!=0) {
75950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr,
76050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "gennorm2 error: "
76150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
76250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                (long)c);
76350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(U_INVALID_FORMAT_ERROR);
76450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
76527f654740f2a26ad62a5c155af9199af9e69b889claireho    int32_t length;
76627f654740f2a26ad62a5c155af9199af9e69b889claireho    const CompositionPair *pairs=p->getCompositionPairs(length);
76750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    for(int32_t i=0; i<length; ++i) {
76827f654740f2a26ad62a5c155af9199af9e69b889claireho        const CompositionPair &pair=pairs[i];
76950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // 22 bits for the composite character and whether it combines forward.
77050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        UChar32 compositeAndFwd=pair.composite<<1;
77150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(getNormRef(pair.composite).compositions!=NULL) {
77250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            compositeAndFwd|=1;  // The composite character also combines-forward.
77350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
77450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // Encode most pairs in two units and some in three.
77550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        int32_t firstUnit, secondUnit, thirdUnit;
77650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
77750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(compositeAndFwd<=0xffff) {
77850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                firstUnit=pair.trail<<1;
77950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                secondUnit=compositeAndFwd;
78050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                thirdUnit=-1;
78150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            } else {
78250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
78350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                secondUnit=compositeAndFwd>>16;
78450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                thirdUnit=compositeAndFwd;
78550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
78650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        } else {
78750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
78850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                       (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
78950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                      Normalizer2Impl::COMP_1_TRIPLE;
79050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
79150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                       (compositeAndFwd>>16);
79250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            thirdUnit=compositeAndFwd;
79350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
79450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        // Set the high bit of the first unit if this is the last composition pair.
79550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(i==(length-1)) {
79650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
79750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
79850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        dataString.append((UChar)firstUnit).append((UChar)secondUnit);
79950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(thirdUnit>=0) {
80050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            dataString.append((UChar)thirdUnit);
80150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
80250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
80350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
80450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
80550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass ExtraDataWriter : public Normalizer2DBEnumerator {
80650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic:
80750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    ExtraDataWriter(Normalizer2DataBuilder &b) :
80850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        Normalizer2DBEnumerator(b),
80950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
81083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
81150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
81250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(value!=0) {
81350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(start!=end) {
81450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                fprintf(stderr,
81550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        "gennorm2 error: unexpected shared data for "
81650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        "multiple code points U+%04lX..U+%04lX\n",
81750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        (long)start, (long)end);
81850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                exit(U_INTERNAL_PROGRAM_ERROR);
81950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
82050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            builder.writeExtraData(start, value, *this);
82150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
82250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return TRUE;
82350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
82450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString maybeYesCompositions;
82550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString yesYesCompositions;
82683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    UnicodeString yesNoMappingsAndCompositions;
82783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    UnicodeString yesNoMappingsOnly;
82850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UnicodeString noNoMappings;
82950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
83050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
83150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
83250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
83350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Norm *p=norms+value;
83483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    if(!p->hasMapping()) {
83583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // Write small-FCD data.
83683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // There is similar code in writeMapping() for characters that do have a mapping.
83783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
83883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            fprintf(stderr,
83983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                    "gennorm2 error: "
84083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                    "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
84183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                    (long)c);
84283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            exit(U_INVALID_FORMAT_ERROR);
84383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        }
84483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        if(p->cc!=0) {
84583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
84683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
84783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        }
84883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    }
84950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(p->combinesBack) {
85050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(p->hasMapping()) {
85150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            fprintf(stderr,
85250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "gennorm2 error: "
85350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
85450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    (long)c);
85550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            exit(U_INVALID_FORMAT_ERROR);
85650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
85750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(p->compositions!=NULL) {
85850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            p->offset=
85950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
86050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                Norm::OFFSET_MAYBE_YES;
86150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            writeCompositions(c, p, writer.maybeYesCompositions);
86250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
86350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    } else if(!p->hasMapping()) {
86450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(p->compositions!=NULL) {
86550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            p->offset=
86650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
86750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                Norm::OFFSET_YES_YES;
86850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            writeCompositions(c, p, writer.yesYesCompositions);
86950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
87050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    } else if(p->mappingType==Norm::ROUND_TRIP) {
87150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(p->compositions!=NULL) {
87283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            int32_t offset=writer.yesNoMappingsAndCompositions.length()+
87383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                           writeMapping(c, p, writer.yesNoMappingsAndCompositions);
87483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
87583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
87683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        } else {
87783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            int32_t offset=writer.yesNoMappingsOnly.length()+
87883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                           writeMapping(c, p, writer.yesNoMappingsOnly);
87983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
88050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
88150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    } else /* one-way */ {
88250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(p->compositions!=NULL) {
88350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            fprintf(stderr,
88450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "gennorm2 error: "
88550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "U+%04lX combines-forward and has a one-way mapping, "
88650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    "not possible in Unicode normalization\n",
88750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    (long)c);
88850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            exit(U_INVALID_FORMAT_ERROR);
88950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
89050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
89150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            // Try a compact, algorithmic encoding.
89283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // Only for ccc=0, because we can't store additional information
89383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // and we do not recursively follow an algorithmic encoding for access to the ccc.
89483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            //
89583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
89683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // if the mappingCP decomposes further, to ensure that there is a place to store it.
89783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // We want to see that the final mapping does not have exactly 1 code point,
89883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // or else we would have to recursively ensure that the final mapping is stored
89983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            // in normal extraData.
90083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
90150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                int32_t delta=p->mappingCP-c;
90250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
90350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                    p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
90450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                }
90550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
90650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
90750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(p->offset==0) {
90850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            int32_t oldNoNoLength=writer.noNoMappings.length();
90983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
91050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
91150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
91250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(previousOffset!=0) {
91350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                // Duplicate, remove the new units and point to the old ones.
91450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                writer.noNoMappings.truncate(oldNoNoLength);
91583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
91650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            } else {
91750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
91850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
91983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
92083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
92150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
92250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
92350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
92450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
92550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
92650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass Norm16Writer : public Normalizer2DBEnumerator {
92750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic:
92850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
92950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
93050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        builder.writeNorm16(start, end, value);
93150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        return TRUE;
93250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
93350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho};
93450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
93550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
93650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(value!=0) {
93750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        const Norm *p=norms+value;
93850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
93950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        int32_t norm16=0;
94050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        UBool isDecompNo=FALSE;
94150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        UBool isCompNoMaybe=FALSE;
94250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        switch(p->offset&Norm::OFFSET_MASK) {
94350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        case Norm::OFFSET_NONE:
94450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            // No mapping, no compositions list.
94550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(p->combinesBack) {
94650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
94750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                isDecompNo=(UBool)(p->cc!=0);
94850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                isCompNoMaybe=TRUE;
94950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            } else if(p->cc!=0) {
95050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
95150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                isDecompNo=isCompNoMaybe=TRUE;
95250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
95350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            break;
95450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        case Norm::OFFSET_MAYBE_YES:
95550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
95650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            isCompNoMaybe=TRUE;
95750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            break;
95850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        case Norm::OFFSET_YES_YES:
95950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            norm16=offset;
96050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            break;
96183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
96250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
96350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            isDecompNo=TRUE;
96450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            break;
96583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        case Norm::OFFSET_YES_NO_MAPPING_ONLY:
96683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
96783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            isDecompNo=TRUE;
96883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            break;
96950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        case Norm::OFFSET_NO_NO:
97050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
97150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            isDecompNo=isCompNoMaybe=TRUE;
97250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            break;
97350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        case Norm::OFFSET_DELTA:
97450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            norm16=getCenterNoNoDelta()+offset;
97550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            isDecompNo=isCompNoMaybe=TRUE;
97650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            break;
97750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        default:  // Should not occur.
97850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            exit(U_INTERNAL_PROGRAM_ERROR);
97950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
98050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
98150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
98250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
98350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
98450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
98550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
98650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
98750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
98850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
98950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
99050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
99150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::setHangulData() {
99250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    HangulIterator hi;
99350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    const HangulIterator::Range *range;
99450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // Check that none of the Hangul/Jamo code points have data.
99550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    while((range=hi.nextRange())!=NULL) {
99650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        for(UChar32 c=range->start; c<range->limit; ++c) {
99750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(utrie2_get32(norm16Trie, c)!=0) {
99850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                fprintf(stderr,
99950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        "gennorm2 error: "
100050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
100150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                        (long)c);
100250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                exit(U_INVALID_FORMAT_ERROR);
100350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
100450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
100550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
100650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // Set data for algorithmic runtime handling.
100750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    IcuToolErrorCode errorCode("gennorm2/setHangulData()");
100850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    hi.reset();
100950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    while((range=hi.nextRange())!=NULL) {
101050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        uint16_t norm16=range->norm16;
101150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if(norm16==0) {
101250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
101350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
101450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
101550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
101650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        } else {
101750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
101850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
101950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            }
102050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
102150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
102250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        errorCode.assertSuccess();
102350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
102450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
102550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
102650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CDECL_BEGIN
102750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
102850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UBool U_CALLCONV
102950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoenumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
103050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    uint32_t *pMaxValue=(uint32_t *)context;
103150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(value>*pMaxValue) {
103250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        *pMaxValue=value;
103350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
103450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    return TRUE;
103550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
103650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
103750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CDECL_END
103850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
103950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::processData() {
104050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    IcuToolErrorCode errorCode("gennorm2/processData()");
104150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    norm16Trie=utrie2_open(0, 0, errorCode);
104250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    errorCode.assertSuccess();
104350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
104450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());
104550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
104650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    Decomposer decomposer(*this);
104750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    do {
104850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        decomposer.didDecompose=FALSE;
104950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
105050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    } while(decomposer.didDecompose);
105150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
105250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    BuilderReorderingBuffer buffer;
105350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t normsLength=utm_countItems(normMem);
105450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    for(int32_t i=1; i<normsLength; ++i) {
105583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // Set the hasNoCompBoundaryAfter flag for use by the last code branch
105683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // in Normalizer2Impl::hasCompBoundaryAfter().
105783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        // For details see the comments on hasNoCompBoundaryAfter(buffer).
105883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        const Norm &norm=norms[i];
105983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        if(norm.hasMapping()) {
106083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            if(norm.compositions!=NULL) {
106183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                norms[i].hasNoCompBoundaryAfter=TRUE;
106283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            } else {
106383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                buffer.reset();
106483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                reorder(norms+i, buffer);
106583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius                norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
106683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius            }
106750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
106850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
106950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
107050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
107150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
107250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
107350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    ExtraDataWriter extraDataWriter(*this);
107450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);
107550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
107650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    extraData=extraDataWriter.maybeYesCompositions;
107750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    extraData.append(extraDataWriter.yesYesCompositions).
107883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius              append(extraDataWriter.yesNoMappingsAndCompositions).
107983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius              append(extraDataWriter.yesNoMappingsOnly).
108050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho              append(extraDataWriter.noNoMappings);
108150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // Pad to even length for 4-byte alignment of following data.
108250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(extraData.length()&1) {
108350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        extraData.append((UChar)0);
108450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
108550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
108650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    indexes[Normalizer2Impl::IX_MIN_YES_NO]=
108750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        extraDataWriter.yesYesCompositions.length();
108883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
108950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        indexes[Normalizer2Impl::IX_MIN_YES_NO]+
109083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        extraDataWriter.yesNoMappingsAndCompositions.length();
109183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    indexes[Normalizer2Impl::IX_MIN_NO_NO]=
109283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
109383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        extraDataWriter.yesNoMappingsOnly.length();
109450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
109550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        indexes[Normalizer2Impl::IX_MIN_NO_NO]+
109650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        extraDataWriter.noNoMappings.length();
109750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
109850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
109950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        extraDataWriter.maybeYesCompositions.length();
110050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
110150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
110250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
110350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr,
110450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "gennorm2 error: "
110550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                "data structure overflow, too much mapping composition data\n");
110650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(U_BUFFER_OVERFLOW_ERROR);
110750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
110850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
110950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());
111050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
111150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    setHangulData();
111250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
111350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // Look for the "worst" norm16 value of any supplementary code point
111450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // corresponding to a lead surrogate, and set it as that surrogate's value.
111550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // Enables quick check inner loops to look at only code units.
111650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    //
111750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // We could be more sophisticated:
111850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // We could collect a bit set for whether there are values in the different
111950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
112050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // and select the best value that only breaks the composition and/or decomposition
112150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // inner loops if necessary.
112250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // However, that seems like overkill for an optimization for supplementary characters.
112350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    for(UChar lead=0xd800; lead<0xdc00; ++lead) {
112450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        uint32_t maxValue=utrie2_get32(norm16Trie, lead);
112550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
112650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
112750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
112850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        ) {
112950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
113050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            // Otherwise it might end up at something like JAMO_VT which stays in
113150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            // the inner decomposition quick check loop.
113250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
113350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        }
113450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
113550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
113650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
113750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
113850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
113950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // which is harmless.
114050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    // As a result, the minimum code points are always BMP code points.
114150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
114250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(minCP>=0x10000) {
114350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
114450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
114550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
114650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(minCP>=0x10000) {
114750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
114850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
114950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
115050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
1151f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
115250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
115350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
115450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                errorCode.errorName());
115550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(errorCode.reset());
115650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
115750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    errorCode.reset();
115850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
115950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t offset=(int32_t)sizeof(indexes);
116050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
116150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    offset+=norm16TrieLength;
116250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
116383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    offset+=extraData.length()*2;
116483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
116583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    offset+=sizeof(smallFCD);
116683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    int32_t totalSize=offset;
116783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
116850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        indexes[i]=totalSize;
116950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
117050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
117150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(beVerbose) {
117250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        printf("size of normalization trie:         %5ld bytes\n", (long)norm16TrieLength);
117350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        printf("size of 16-bit extra data:          %5ld uint16_t\n", (long)extraData.length());
117483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        printf("size of small-FCD data:             %5ld bytes\n", (long)sizeof(smallFCD));
117550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        printf("size of binary data file contents:  %5ld bytes\n", (long)totalSize);
117650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        printf("minDecompNoCodePoint:              U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
117750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
117850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
117983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        printf("minYesNoMappingsOnly:              0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
118050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
118150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
118250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
118350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
118450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
118583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    UVersionInfo nullVersion={ 0, 0, 0, 0 };
118683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    if(0==memcmp(nullVersion, unicodeVersion, 4)) {
118783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
118883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    }
118950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    memcpy(dataInfo.dataVersion, unicodeVersion, 4);
1190f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius}
1191f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
1192f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusvoid Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
1193f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    processData();
1194f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
1195f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
1196f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
1197f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
1198f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    errorCode.assertSuccess();
1199f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
120050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    UNewDataMemory *pData=
120150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        udata_create(NULL, NULL, filename, &dataInfo,
120250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                     haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
120350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(errorCode.isFailure()) {
120450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
120550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho                filename, errorCode.errorName());
120650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(errorCode.reset());
120750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
120850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    udata_writeBlock(pData, indexes, sizeof(indexes));
120950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
121050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    udata_writeUString(pData, extraData.getBuffer(), extraData.length());
121183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
121250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    int32_t writtenSize=udata_finish(pData, errorCode);
121350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(errorCode.isFailure()) {
121450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
121550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(errorCode.reset());
121650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
1217f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
121850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    if(writtenSize!=totalSize) {
121950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
122050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho            (long)writtenSize, (long)totalSize);
122150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho        exit(U_INTERNAL_PROGRAM_ERROR);
122250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho    }
122350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}
122450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
1225f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusvoid
1226f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusNormalizer2DataBuilder::writeCSourceFile(const char *filename) {
1227f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    processData();
1228f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
1229f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()");
1230f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    const char *basename=findBasename(filename);
1231f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    CharString path(filename, (int32_t)(basename-filename), errorCode);
1232f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    CharString dataName(basename, errorCode);
1233f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    const char *extension=strrchr(basename, '.');
1234f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    if(extension!=NULL) {
1235f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        dataName.truncate((int32_t)(extension-basename));
1236f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    }
1237f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    errorCode.assertSuccess();
1238f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
1239f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
1240f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
1241f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    errorCode.assertSuccess();
1242f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
1243f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    FILE *f=usrc_create(path.data(), basename, "icu/source/tools/gennorm2/n2builder.cpp");
1244f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    if(f==NULL) {
1245f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
1246f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius                filename);
1247f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        exit(U_FILE_ACCESS_ERROR);
1248f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        return;
1249f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    }
1250f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    char line[100];
1251f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    sprintf(line, "static const UVersionInfo %s_formatVersion={", dataName.data());
1252f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n");
1253f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    sprintf(line, "static const UVersionInfo %s_dataVersion={", dataName.data());
1254f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n");
1255f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n",
1256f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius            dataName.data());
1257f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    usrc_writeArray(f,
1258f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        line,
1259f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        indexes, 32, Normalizer2Impl::IX_COUNT,
1260f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        "\n};\n\n");
1261f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    sprintf(line, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName.data());
1262f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    usrc_writeUTrie2Arrays(f,
1263f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        line, NULL,
1264f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        norm16Trie,
1265f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        "\n};\n\n");
1266f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", dataName.data());
1267f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    usrc_writeArray(f,
1268f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        line,
1269f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        extraData.getBuffer(), 16, extraData.length(),
1270f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        "\n};\n\n");
1271f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName.data());
1272f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    usrc_writeArray(f,
1273f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        line,
1274f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        smallFCD, 8, sizeof(smallFCD),
1275f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        "\n};\n\n");
1276f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    /*fputs(  // TODO
1277f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        "static const UCaseProps %s_singleton={\n"
1278f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        "  NULL,\n"
1279f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        "  %s_indexes,\n"
1280f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        "  %s_extraData,\n"
1281f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        "  %s_smallFCD,\n",
1282f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        f);*/
1283f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data());
1284f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    char line2[100];
1285f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    sprintf(line2, "%s_trieIndex", dataName.data());
1286f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    usrc_writeUTrie2Struct(f,
1287f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        line,
1288f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        norm16Trie, line2, NULL,
1289f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        "};\n");
1290f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    fclose(f);
1291f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius}
1292f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius
129350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_END
129450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
129550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif /* #if !UCONFIG_NO_NORMALIZATION */
129650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho
129750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/*
129850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Hey, Emacs, please set the following:
129950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *
130050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Local Variables:
130150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * indent-tabs-mode: nil
130250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * End:
130350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */
1304