1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 2009-2016, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  n2builder.cpp
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2009nov25
16*   created by: Markus W. Scherer
17*
18* Builds Normalizer2 data and writes a binary .nrm file.
19* For the file format see source/common/normalizer2impl.h.
20*/
21
22#include "unicode/utypes.h"
23#include "n2builder.h"
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#if U_HAVE_STD_STRING
29#include <vector>
30#endif
31#include "unicode/errorcode.h"
32#include "unicode/localpointer.h"
33#include "unicode/putil.h"
34#include "unicode/udata.h"
35#include "unicode/uniset.h"
36#include "unicode/unistr.h"
37#include "unicode/ustring.h"
38#include "charstr.h"
39#include "hash.h"
40#include "normalizer2impl.h"
41#include "toolutil.h"
42#include "unewdata.h"
43#include "utrie2.h"
44#include "uvectr32.h"
45#include "writesrc.h"
46
47#if !UCONFIG_NO_NORMALIZATION
48
/*
 * UDataInfo cf. udata.h
 * Header record for the binary data file; the initializers below are positional.
 * NOTE(review): the per-field names in the trailing comments follow the UDataInfo
 * declaration in udata.h, which is not part of this file -- confirm there.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),          /* size */
    0,                          /* reservedWord */

    U_IS_BIG_ENDIAN,            /* isBigEndian */
    U_CHARSET_FAMILY,           /* charsetFamily */
    U_SIZEOF_UCHAR,             /* sizeofUChar */
    0,                          /* reservedByte */

    { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
    { 2, 0, 0, 0 },             /* formatVersion */
    { 5, 2, 0, 0 }              /* dataVersion (Unicode version) */
};
63
64U_NAMESPACE_BEGIN
65
66class HangulIterator {
67public:
68    struct Range {
69        UChar32 start, limit;
70        uint16_t norm16;
71    };
72
73    HangulIterator() : rangeIndex(0) {}
74    const Range *nextRange() {
75        if(rangeIndex<UPRV_LENGTHOF(ranges)) {
76            return ranges+rangeIndex++;
77        } else {
78            return NULL;
79        }
80    }
81    void reset() { rangeIndex=0; }
82private:
83    static const Range ranges[4];
84    int32_t rangeIndex;
85};
86
// The table that HangulIterator::nextRange() walks, in this order:
// Jamo L, Jamo V, Jamo T, and the precomposed Hangul syllables.
// Each entry is { start, limit (exclusive), norm16 }.
const HangulIterator::Range HangulIterator::ranges[4]={
    { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
    { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
    // JAMO_T_BASE+1: not U+11A7
    { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
    { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
};
94
95struct CompositionPair {
96    CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
97    UChar32 trail, composite;
98};
99
100struct Norm {
101    enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
102
103    UBool hasMapping() const { return mappingType>REMOVED; }
104
105    // Requires hasMapping() and well-formed mapping.
106    void setMappingCP() {
107        UChar32 c;
108        if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
109            mappingCP=c;
110        } else {
111            mappingCP=U_SENTINEL;
112        }
113    }
114
115    const CompositionPair *getCompositionPairs(int32_t &length) const {
116        if(compositions==NULL) {
117            length=0;
118            return NULL;
119        } else {
120            length=compositions->size()/2;
121            return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
122        }
123    }
124
125    UnicodeString *mapping;
126    UnicodeString *rawMapping;  // non-NULL if the mapping is further decomposed
127    UChar32 mappingCP;  // >=0 if mapping to 1 code point
128    int32_t mappingPhase;
129    MappingType mappingType;
130
131    UVector32 *compositions;  // (trail, composite) pairs
132    uint8_t cc;
133    UBool combinesBack;
134    UBool hasNoCompBoundaryAfter;
135
136    enum OffsetType {
137        OFFSET_NONE,
138        // Composition for back-combining character. Allowed, but not normally used.
139        OFFSET_MAYBE_YES,
140        // Composition for a starter that does not have a decomposition mapping.
141        OFFSET_YES_YES,
142        // Round-trip mapping & composition for a starter.
143        OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
144        // Round-trip mapping for a starter that itself does not combine-forward.
145        OFFSET_YES_NO_MAPPING_ONLY,
146        // One-way mapping.
147        OFFSET_NO_NO,
148        // Delta for an algorithmic one-way mapping.
149        OFFSET_DELTA
150    };
151    enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
152    int32_t offset;
153};
154
155class Normalizer2DBEnumerator {
156public:
157    Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}
158    virtual ~Normalizer2DBEnumerator() {}
159    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;
160    Normalizer2DBEnumerator *ptr() { return this; }
161protected:
162    Normalizer2DataBuilder &builder;
163};
164
165U_CDECL_BEGIN
166
167static UBool U_CALLCONV
168enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
169    return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);
170}
171
172U_CDECL_END
173
174Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
175        phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL),
176        norm16TrieLength(0) {
177    memset(unicodeVersion, 0, sizeof(unicodeVersion));
178    normTrie=utrie2_open(0, 0, &errorCode);
179    normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
180    norms=allocNorm();  // unused Norm struct at index 0
181    memset(indexes, 0, sizeof(indexes));
182    memset(smallFCD, 0, sizeof(smallFCD));
183}
184
185Normalizer2DataBuilder::~Normalizer2DataBuilder() {
186    utrie2_close(normTrie);
187    int32_t normsLength=utm_countItems(normMem);
188    for(int32_t i=1; i<normsLength; ++i) {
189        delete norms[i].mapping;
190        delete norms[i].rawMapping;
191        delete norms[i].compositions;
192    }
193    utm_close(normMem);
194    utrie2_close(norm16Trie);
195}
196
197void
198Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
199    UVersionInfo nullVersion={ 0, 0, 0, 0 };
200    UVersionInfo version;
201    u_versionFromString(version, v);
202    if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
203        0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
204    ) {
205        char buffer[U_MAX_VERSION_STRING_LENGTH];
206        u_versionToString(unicodeVersion, buffer);
207        fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
208                buffer, v);
209        exit(U_ILLEGAL_ARGUMENT_ERROR);
210    }
211    memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
212}
213
214Norm *Normalizer2DataBuilder::allocNorm() {
215    Norm *p=(Norm *)utm_alloc(normMem);
216    norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
217    return p;
218}
219
220/* get an existing Norm unit */
221Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {
222    uint32_t i=utrie2_get32(normTrie, c);
223    if(i==0) {
224        return NULL;
225    }
226    return norms+i;
227}
228
229const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
230    return norms[utrie2_get32(normTrie, c)];
231}
232
233/*
234 * get or create a Norm unit;
235 * get or create the intermediate trie entries for it as well
236 */
237Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {
238    uint32_t i=utrie2_get32(normTrie, c);
239    if(i!=0) {
240        return norms+i;
241    } else {
242        /* allocate Norm */
243        Norm *p=allocNorm();
244        IcuToolErrorCode errorCode("gennorm2/createNorm()");
245        utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
246        return p;
247    }
248}
249
250Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
251    if(p!=NULL) {
252        if(p->mappingType!=Norm::NONE) {
253            if( overrideHandling==OVERRIDE_NONE ||
254                (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
255            ) {
256                fprintf(stderr,
257                        "error in gennorm2 phase %d: "
258                        "not permitted to override mapping for U+%04lX from phase %d\n",
259                        (int)phase, (long)c, (int)p->mappingPhase);
260                exit(U_INVALID_FORMAT_ERROR);
261            }
262            delete p->mapping;
263            p->mapping=NULL;
264        }
265        p->mappingPhase=phase;
266    }
267    return p;
268}
269
270void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
271    overrideHandling=oh;
272    ++phase;
273}
274
275void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
276    createNorm(c)->cc=cc;
277}
278
279uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {
280    return getNormRef(c).cc;
281}
282
283static UBool isWellFormed(const UnicodeString &s) {
284    UErrorCode errorCode=U_ZERO_ERROR;
285    u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode);
286    return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
287}
288
289void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
290    if(!isWellFormed(m)) {
291        fprintf(stderr,
292                "error in gennorm2 phase %d: "
293                "illegal one-way mapping from U+%04lX to malformed string\n",
294                (int)phase, (long)c);
295        exit(U_INVALID_FORMAT_ERROR);
296    }
297    Norm *p=checkNormForMapping(createNorm(c), c);
298    p->mapping=new UnicodeString(m);
299    p->mappingType=Norm::ONE_WAY;
300    p->setMappingCP();
301}
302
303void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
304    if(U_IS_SURROGATE(c)) {
305        fprintf(stderr,
306                "error in gennorm2 phase %d: "
307                "illegal round-trip mapping from surrogate code point U+%04lX\n",
308                (int)phase, (long)c);
309        exit(U_INVALID_FORMAT_ERROR);
310    }
311    if(!isWellFormed(m)) {
312        fprintf(stderr,
313                "error in gennorm2 phase %d: "
314                "illegal round-trip mapping from U+%04lX to malformed string\n",
315                (int)phase, (long)c);
316        exit(U_INVALID_FORMAT_ERROR);
317    }
318    int32_t numCP=u_countChar32(m.getBuffer(), m.length());
319    if(numCP!=2) {
320        fprintf(stderr,
321                "error in gennorm2 phase %d: "
322                "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
323                (int)phase, (long)c, (int)numCP);
324        exit(U_INVALID_FORMAT_ERROR);
325    }
326    Norm *p=checkNormForMapping(createNorm(c), c);
327    p->mapping=new UnicodeString(m);
328    p->mappingType=Norm::ROUND_TRIP;
329    p->mappingCP=U_SENTINEL;
330}
331
332void Normalizer2DataBuilder::removeMapping(UChar32 c) {
333    Norm *p=checkNormForMapping(getNorm(c), c);
334    if(p!=NULL) {
335        p->mappingType=Norm::REMOVED;
336    }
337}
338
339class CompositionBuilder : public Normalizer2DBEnumerator {
340public:
341    CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
342    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
343        builder.addComposition(start, end, value);
344        return TRUE;
345    }
346};
347
348void
349Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
350    if(norms[value].mappingType==Norm::ROUND_TRIP) {
351        if(start!=end) {
352            fprintf(stderr,
353                    "gennorm2 error: same round-trip mapping for "
354                    "more than 1 code point U+%04lX..U+%04lX\n",
355                    (long)start, (long)end);
356            exit(U_INVALID_FORMAT_ERROR);
357        }
358        if(norms[value].cc!=0) {
359            fprintf(stderr,
360                    "gennorm2 error: "
361                    "U+%04lX has a round-trip mapping and ccc!=0, "
362                    "not possible in Unicode normalization\n",
363                    (long)start);
364            exit(U_INVALID_FORMAT_ERROR);
365        }
366        // setRoundTripMapping() ensured that there are exactly two code points.
367        const UnicodeString &m=*norms[value].mapping;
368        UChar32 lead=m.char32At(0);
369        UChar32 trail=m.char32At(m.length()-1);
370        if(getCC(lead)!=0) {
371            fprintf(stderr,
372                    "gennorm2 error: "
373                    "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
374                    "not possible in Unicode normalization\n",
375                    (long)start, (long)lead);
376            exit(U_INVALID_FORMAT_ERROR);
377        }
378        // Flag for trailing character.
379        createNorm(trail)->combinesBack=TRUE;
380        // Insert (trail, composite) pair into compositions list for the lead character.
381        IcuToolErrorCode errorCode("gennorm2/addComposition()");
382        Norm *leadNorm=createNorm(lead);
383        UVector32 *compositions=leadNorm->compositions;
384        int32_t i;
385        if(compositions==NULL) {
386            compositions=leadNorm->compositions=new UVector32(errorCode);
387            i=0;  // "insert" the first pair at index 0
388        } else {
389            // Insertion sort, and check for duplicate trail characters.
390            int32_t length;
391            const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
392            for(i=0; i<length; ++i) {
393                if(trail==pairs[i].trail) {
394                    fprintf(stderr,
395                            "gennorm2 error: same round-trip mapping for "
396                            "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
397                            (long)start, (long)lead, (long)trail);
398                    exit(U_INVALID_FORMAT_ERROR);
399                }
400                if(trail<pairs[i].trail) {
401                    break;
402                }
403            }
404        }
405        compositions->insertElementAt(trail, 2*i, errorCode);
406        compositions->insertElementAt(start, 2*i+1, errorCode);
407    }
408}
409
410UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,
411                                                    uint8_t lowCC, uint8_t highCC) const {
412    if((highCC-lowCC)>=2) {
413        int32_t length;
414        const CompositionPair *pairs=norm.getCompositionPairs(length);
415        for(int32_t i=0; i<length; ++i) {
416            uint8_t trailCC=getCC(pairs[i].trail);
417            if(lowCC<trailCC && trailCC<highCC) {
418                return TRUE;
419            }
420        }
421    }
422    return FALSE;
423}
424
425UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {
426    int32_t length;
427    const CompositionPair *pairs=norm.getCompositionPairs(length);
428    for(int32_t i=0; i<length; ++i) {
429        if(trail==pairs[i].trail) {
430            return pairs[i].composite;
431        }
432        if(trail<pairs[i].trail) {
433            break;
434        }
435    }
436    return U_SENTINEL;
437}
438
439class Decomposer : public Normalizer2DBEnumerator {
440public:
441    Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
442    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
443        didDecompose|=builder.decompose(start, end, value);
444        return TRUE;
445    }
446    UBool didDecompose;
447};
448
/*
 * Performs one decomposition step for the range start..end whose Norm index is value.
 *
 * If that Norm has a mapping, each code point of the mapping string is looked at:
 * a code point that has a mapping itself is replaced by that mapping,
 * a Hangul syllable is replaced by its Hangul::decompose() result,
 * and every other code point is kept as is.
 * The first time the mapping is replaced, the original string is kept in rawMapping.
 *
 * Only one level is expanded per call, so the caller has to repeat the step
 * for as long as it returns TRUE in order to decompose recursively.
 *
 * Exits with U_INVALID_FORMAT_ERROR if the mapping contains a code point from
 * start..end, or if a round-trip mapping has a decomposable character in a way
 * that the error messages below describe as not possible in Unicode normalization.
 *
 * Returns TRUE if the mapping was replaced, FALSE if nothing changed.
 */
UBool
Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
    if(norms[value].hasMapping()) {
        Norm &norm=norms[value];
        const UnicodeString &m=*norm.mapping;
        // Created lazily: stays NULL until the first code point that needs replacing.
        UnicodeString *decomposed=NULL;
        const UChar *s=m.getBuffer();
        int32_t length=m.length();
        // prev is the start index of the current code point c, i is the index after it.
        int32_t prev, i=0;
        UChar32 c;
        while(i<length) {
            prev=i;
            U16_NEXT(s, i, length, c);
            if(start<=c && c<=end) {
                fprintf(stderr,
                        "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
                        (long)c);
                exit(U_INVALID_FORMAT_ERROR);
            }
            const Norm &cNorm=getNormRef(c);
            if(cNorm.hasMapping()) {
                if(norm.mappingType==Norm::ROUND_TRIP) {
                    if(prev==0) {
                        // c is the first code point (the starter) of the round-trip mapping.
                        if(cNorm.mappingType!=Norm::ROUND_TRIP) {
                            fprintf(stderr,
                                    "gennorm2 error: "
                                    "U+%04lX's round-trip mapping's starter "
                                    "U+%04lX one-way-decomposes, "
                                    "not possible in Unicode normalization\n",
                                    (long)start, (long)c);
                            exit(U_INVALID_FORMAT_ERROR);
                        }
                        // Compare the combining class of this mapping's second code point
                        // with that of the last code point of the starter's own mapping.
                        uint8_t myTrailCC=getCC(m.char32At(i));
                        UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
                        uint8_t cTrailCC=getCC(cTrailChar);
                        if(cTrailCC>myTrailCC) {
                            fprintf(stderr,
                                    "gennorm2 error: "
                                    "U+%04lX's round-trip mapping's starter "
                                    "U+%04lX decomposes and the "
                                    "inner/earlier tccc=%hu > outer/following tccc=%hu, "
                                    "not possible in Unicode normalization\n",
                                    (long)start, (long)c,
                                    (short)cTrailCC, (short)myTrailCC);
                            exit(U_INVALID_FORMAT_ERROR);
                        }
                    } else {
                        // c is not the first code point of the round-trip mapping.
                        fprintf(stderr,
                                "gennorm2 error: "
                                "U+%04lX's round-trip mapping's non-starter "
                                "U+%04lX decomposes, "
                                "not possible in Unicode normalization\n",
                                (long)start, (long)c);
                        exit(U_INVALID_FORMAT_ERROR);
                    }
                }
                if(decomposed==NULL) {
                    // Start the new string with the unchanged part of m before c.
                    decomposed=new UnicodeString(m, 0, prev);
                }
                decomposed->append(*cNorm.mapping);
            } else if(Hangul::isHangul(c)) {
                // Hangul syllables have no Norm mapping; they are decomposed algorithmically.
                UChar buffer[3];
                int32_t hangulLength=Hangul::decompose(c, buffer);
                if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
                    fprintf(stderr,
                            "gennorm2 error: "
                            "U+%04lX's round-trip mapping's non-starter "
                            "U+%04lX decomposes, "
                            "not possible in Unicode normalization\n",
                            (long)start, (long)c);
                    exit(U_INVALID_FORMAT_ERROR);
                }
                if(decomposed==NULL) {
                    // Start the new string with the unchanged part of m before c.
                    decomposed=new UnicodeString(m, 0, prev);
                }
                decomposed->append(buffer, hangulLength);
            } else if(decomposed!=NULL) {
                // c stays as is; copy it only if a new string has already been started.
                decomposed->append(m, prev, i-prev);
            }
        }
        if(decomposed!=NULL) {
            if(norm.rawMapping==NULL) {
                // Remember the original mapping when decomposing recursively.
                norm.rawMapping=norm.mapping;
            } else {
                // The original is already saved; the intermediate mapping is not needed.
                delete norm.mapping;
            }
            norm.mapping=decomposed;
            // Not  norm.setMappingCP();  because the original mapping
            // is most likely to be encodable as a delta.
            return TRUE;
        }
    }
    return FALSE;
}
544
545class BuilderReorderingBuffer {
546public:
547    BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
548    void reset() {
549        fLength=0;
550        fLastStarterIndex=-1;
551        fDidReorder=FALSE;
552    }
553    int32_t length() const { return fLength; }
554    UBool isEmpty() const { return fLength==0; }
555    int32_t lastStarterIndex() const { return fLastStarterIndex; }
556    UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
557    uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
558    UBool didReorder() const { return fDidReorder; }
559    void append(UChar32 c, uint8_t cc) {
560        if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
561            if(cc==0) {
562                fLastStarterIndex=fLength;
563            }
564            fArray[fLength++]=(c<<8)|cc;
565            return;
566        }
567        // Let this character bubble back to its canonical order.
568        int32_t i=fLength-1;
569        while(i>fLastStarterIndex && ccAt(i)>cc) {
570            --i;
571        }
572        ++i;  // after the last starter or prevCC<=cc
573        // Move this and the following characters forward one to make space.
574        for(int32_t j=fLength; i<j; --j) {
575            fArray[j]=fArray[j-1];
576        }
577        fArray[i]=(c<<8)|cc;
578        ++fLength;
579        fDidReorder=TRUE;
580    }
581    void toString(UnicodeString &dest) {
582        dest.remove();
583        for(int32_t i=0; i<fLength; ++i) {
584            dest.append(charAt(i));
585        }
586    }
587    void setComposite(UChar32 composite, int32_t combMarkIndex) {
588        fArray[fLastStarterIndex]=composite<<8;
589        // Remove the combining mark that contributed to the composite.
590        --fLength;
591        while(combMarkIndex<fLength) {
592            fArray[combMarkIndex]=fArray[combMarkIndex+1];
593            ++combMarkIndex;
594        }
595    }
596private:
597    int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
598    int32_t fLength;
599    int32_t fLastStarterIndex;
600    UBool fDidReorder;
601};
602
603void
604Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
605    UnicodeString &m=*p->mapping;
606    int32_t length=m.length();
607    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
608        return;  // writeMapping() will complain about it and print the code point.
609    }
610    const UChar *s=m.getBuffer();
611    int32_t i=0;
612    UChar32 c;
613    while(i<length) {
614        U16_NEXT(s, i, length, c);
615        buffer.append(c, getCC(c));
616    }
617    if(buffer.didReorder()) {
618        buffer.toString(m);
619    }
620}
621
/*
 * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
 * A starter character with a mapping does not have a composition boundary after it
 * if the character itself combines-forward (which is tested by the caller of this function),
 * or it is deleted (mapped to the empty string),
 * or its mapping contains no starter,
 * or the last starter combines-forward.
 *
 * buffer holds the code points of the mapping with their combining classes.
 * It may be modified: compositions found below are applied via buffer.setComposite().
 *
 * Returns TRUE if there is no composition boundary after the mapping, otherwise FALSE.
 */
UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
    if(buffer.isEmpty()) {
        return TRUE;  // maps-to-empty-string is no boundary of any kind
    }
    int32_t lastStarterIndex=buffer.lastStarterIndex();
    if(lastStarterIndex<0) {
        return TRUE;  // no starter
    }
    UChar32 starter=buffer.charAt(lastStarterIndex);
    if( Hangul::isJamoL(starter) ||
        (Hangul::isJamoV(starter) &&
         0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))
    ) {
        // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
        // otherwise it is blocked.
        return lastStarterIndex==buffer.length()-1;
    }
    // Note: There can be no Hangul syllable in the fully decomposed mapping.
    const Norm *starterNorm=&getNormRef(starter);
    if(starterNorm->compositions==NULL) {
        return FALSE;  // the last starter does not combine forward
    }
    // Compose as far as possible, and see if further compositions are possible.
    // prevCC is the combining class of the most recent mark that was not composed.
    uint8_t prevCC=0;
    for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
        uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
        // A character with a combining class strictly between prevCC and cc
        // could follow the mapping and still compose with the starter.
        if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {
            return TRUE;
        }
        // combine() returns a negative value (U_SENTINEL) if there is no composite.
        if( prevCC<cc &&
            (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0
        ) {
            // setComposite() removes the mark at combMarkIndex,
            // so the index now refers to the next mark and is not incremented.
            buffer.setComposite(starter, combMarkIndex);
            starterNorm=&getNormRef(starter);
            if(starterNorm->compositions==NULL) {
                return FALSE;  // the composite does not combine further
            }
        } else {
            prevCC=cc;
            ++combMarkIndex;
        }
    }
    // TRUE if the final, forward-combining starter is at the end.
    return prevCC==0;
}
675
// Appends the extraData for the mapping of c to dataString, in this order:
// optional raw mapping data, optional ccc/lccc word, the "first unit", the mapping string.
// Also sets the small-FCD bit for c if the mapping's lccc or tccc is not 0.
// Exits with U_INVALID_FORMAT_ERROR if the mapping or raw mapping is too long,
// or if c is below Normalizer2Impl::MIN_CCC_LCCC_CP and has ccc!=0 or lccc!=0.
//
// Requires p->hasMapping().
// Returns the offset of the "first unit" from the beginning of the extraData for c.
// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
    UnicodeString &m=*p->mapping;
    int32_t length=m.length();
    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
        fprintf(stderr,
                "gennorm2 error: "
                "mapping for U+%04lX longer than maximum of %d\n",
                (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // Combining classes of the first and last code points of the mapping.
    int32_t leadCC, trailCC;
    if(length==0) {
        leadCC=trailCC=0;
    } else {
        leadCC=getCC(m.char32At(0));
        trailCC=getCC(m.char32At(length-1));
    }
    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
                (long)c);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // Write small-FCD data.
    if((leadCC|trailCC)!=0) {
        // Indexed by the BMP code point, or by the lead surrogate of a supplementary one:
        // one byte per 256 code units, one bit per 32 code units.
        UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
        smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
    }
    // Write the mapping & raw mapping extraData.
    // The first unit has the mapping length in its low bits and tccc in bits 15..8;
    // flag bits are added below.
    int32_t firstUnit=length|(trailCC<<8);
    int32_t preMappingLength=0;
    if(p->rawMapping!=NULL) {
        UnicodeString &rm=*p->rawMapping;
        int32_t rmLength=rm.length();
        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
            fprintf(stderr,
                    "gennorm2 error: "
                    "raw mapping for U+%04lX longer than maximum of %d\n",
                    (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
            exit(U_INVALID_FORMAT_ERROR);
        }
        UChar rm0=rm.charAt(0);
        // rm0 must be greater than MAPPING_LENGTH_MASK for the compressed form;
        // the other form ends with a length unit, which is at most MAPPING_LENGTH_MASK.
        if( rmLength==length-1 &&
            // 99: overlong substring lengths get pinned to remainder lengths anyway
            0==rm.compare(1, 99, m, 2, 99) &&
            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
        ) {
            // Compression:
            // rawMapping=rm0+mapping.substring(2) -> store only rm0
            //
            // The raw mapping is the same as the final mapping after replacing
            // the final mapping's first two code units with the raw mapping's first one.
            // In this case, we store only that first unit, rm0.
            // This helps with a few hundred mappings.
            dataString.append(rm0);
            preMappingLength=1;
        } else {
            // Store the raw mapping with its length.
            dataString.append(rm);
            dataString.append((UChar)rmLength);
            preMappingLength=rmLength+1;
        }
        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
    }
    // One unit with ccc in the low byte and lccc in the high byte, written only if not 0.
    int32_t cccLccc=p->cc|(leadCC<<8);
    if(cccLccc!=0) {
        dataString.append((UChar)cccLccc);
        ++preMappingLength;
        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
    }
    if(p->hasNoCompBoundaryAfter) {
        firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
    }
    dataString.append((UChar)firstUnit);
    dataString.append(m);
    return preMappingLength;
}
757
// Appends the compositions list of c to dataString,
// encoding each (trail, composite) pair in two or three 16-bit units.
// Exits with U_INVALID_FORMAT_ERROR if c has ccc!=0.
//
// Requires p->compositions!=NULL.
void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
    if(p->cc!=0) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
                (long)c);
        exit(U_INVALID_FORMAT_ERROR);
    }
    int32_t length;
    const CompositionPair *pairs=p->getCompositionPairs(length);
    for(int32_t i=0; i<length; ++i) {
        const CompositionPair &pair=pairs[i];
        // 22 bits for the composite character and whether it combines forward.
        UChar32 compositeAndFwd=pair.composite<<1;
        if(getNormRef(pair.composite).compositions!=NULL) {
            compositeAndFwd|=1;  // The composite character also combines-forward.
        }
        // Encode most pairs in two units and some in three.
        // thirdUnit stays -1 when only two units are needed.
        int32_t firstUnit, secondUnit, thirdUnit;
        if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
            if(compositeAndFwd<=0xffff) {
                // Small trail and small composite: two units.
                firstUnit=pair.trail<<1;
                secondUnit=compositeAndFwd;
                thirdUnit=-1;
            } else {
                // Small trail, composite split across the second and third units.
                firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
                secondUnit=compositeAndFwd>>16;
                thirdUnit=compositeAndFwd;
            }
        } else {
            // Large trail: its upper bits go into the first unit,
            // its lower bits share the second unit with the upper bits of the composite.
            firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
                       (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
                      Normalizer2Impl::COMP_1_TRIPLE;
            secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
                       (compositeAndFwd>>16);
            thirdUnit=compositeAndFwd;
        }
        // Set the high bit of the first unit if this is the last composition pair.
        if(i==(length-1)) {
            firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
        }
        // The (UChar) casts keep only the low 16 bits of each unit.
        dataString.append((UChar)firstUnit).append((UChar)secondUnit);
        if(thirdUnit>=0) {
            dataString.append((UChar)thirdUnit);
        }
    }
}
806
807class ExtraDataWriter : public Normalizer2DBEnumerator {
808public:
809    ExtraDataWriter(Normalizer2DataBuilder &b) :
810        Normalizer2DBEnumerator(b),
811        yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
812        yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
813    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
814        if(value!=0) {
815            if(start!=end) {
816                fprintf(stderr,
817                        "gennorm2 error: unexpected shared data for "
818                        "multiple code points U+%04lX..U+%04lX\n",
819                        (long)start, (long)end);
820                exit(U_INTERNAL_PROGRAM_ERROR);
821            }
822            builder.writeExtraData(start, value, *this);
823        }
824        return TRUE;
825    }
826    UnicodeString maybeYesCompositions;
827    UnicodeString yesYesCompositions;
828    UnicodeString yesNoMappingsAndCompositions;
829    UnicodeString yesNoMappingsOnly;
830    UnicodeString noNoMappings;
831    Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
832};
833
834void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
835    Norm *p=norms+value;
836    if(!p->hasMapping()) {
837        // Write small-FCD data.
838        // There is similar code in writeMapping() for characters that do have a mapping.
839        if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
840            fprintf(stderr,
841                    "gennorm2 error: "
842                    "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
843                    (long)c);
844            exit(U_INVALID_FORMAT_ERROR);
845        }
846        if(p->cc!=0) {
847            UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
848            smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
849        }
850    }
851    if(p->combinesBack) {
852        if(p->hasMapping()) {
853            fprintf(stderr,
854                    "gennorm2 error: "
855                    "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
856                    (long)c);
857            exit(U_INVALID_FORMAT_ERROR);
858        }
859        if(p->compositions!=NULL) {
860            p->offset=
861                (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
862                Norm::OFFSET_MAYBE_YES;
863            writeCompositions(c, p, writer.maybeYesCompositions);
864        }
865    } else if(!p->hasMapping()) {
866        if(p->compositions!=NULL) {
867            p->offset=
868                (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
869                Norm::OFFSET_YES_YES;
870            writeCompositions(c, p, writer.yesYesCompositions);
871        }
872    } else if(p->mappingType==Norm::ROUND_TRIP) {
873        if(p->compositions!=NULL) {
874            int32_t offset=writer.yesNoMappingsAndCompositions.length()+
875                           writeMapping(c, p, writer.yesNoMappingsAndCompositions);
876            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
877            writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
878        } else {
879            int32_t offset=writer.yesNoMappingsOnly.length()+
880                           writeMapping(c, p, writer.yesNoMappingsOnly);
881            p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
882        }
883    } else /* one-way */ {
884        if(p->compositions!=NULL) {
885            fprintf(stderr,
886                    "gennorm2 error: "
887                    "U+%04lX combines-forward and has a one-way mapping, "
888                    "not possible in Unicode normalization\n",
889                    (long)c);
890            exit(U_INVALID_FORMAT_ERROR);
891        }
892        if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
893            // Try a compact, algorithmic encoding.
894            // Only for ccc=0, because we can't store additional information
895            // and we do not recursively follow an algorithmic encoding for access to the ccc.
896            //
897            // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
898            // if the mappingCP decomposes further, to ensure that there is a place to store it.
899            // We want to see that the final mapping does not have exactly 1 code point,
900            // or else we would have to recursively ensure that the final mapping is stored
901            // in normal extraData.
902            if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
903                int32_t delta=p->mappingCP-c;
904                if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
905                    p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
906                }
907            }
908        }
909        if(p->offset==0) {
910            int32_t oldNoNoLength=writer.noNoMappings.length();
911            int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
912            UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
913            int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
914            if(previousOffset!=0) {
915                // Duplicate, remove the new units and point to the old ones.
916                writer.noNoMappings.truncate(oldNoNoLength);
917                p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
918            } else {
919                // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
920                IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
921                writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
922                p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
923            }
924        }
925    }
926}
927
928class Norm16Writer : public Normalizer2DBEnumerator {
929public:
930    Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
931    virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
932        builder.writeNorm16(start, end, value);
933        return TRUE;
934    }
935};
936
937void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
938    if(value!=0) {
939        const Norm *p=norms+value;
940        int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
941        int32_t norm16=0;
942        UBool isDecompNo=FALSE;
943        UBool isCompNoMaybe=FALSE;
944        switch(p->offset&Norm::OFFSET_MASK) {
945        case Norm::OFFSET_NONE:
946            // No mapping, no compositions list.
947            if(p->combinesBack) {
948                norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
949                isDecompNo=(UBool)(p->cc!=0);
950                isCompNoMaybe=TRUE;
951            } else if(p->cc!=0) {
952                norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
953                isDecompNo=isCompNoMaybe=TRUE;
954            }
955            break;
956        case Norm::OFFSET_MAYBE_YES:
957            norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
958            isCompNoMaybe=TRUE;
959            break;
960        case Norm::OFFSET_YES_YES:
961            norm16=offset;
962            break;
963        case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
964            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
965            isDecompNo=TRUE;
966            break;
967        case Norm::OFFSET_YES_NO_MAPPING_ONLY:
968            norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
969            isDecompNo=TRUE;
970            break;
971        case Norm::OFFSET_NO_NO:
972            norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
973            isDecompNo=isCompNoMaybe=TRUE;
974            break;
975        case Norm::OFFSET_DELTA:
976            norm16=getCenterNoNoDelta()+offset;
977            isDecompNo=isCompNoMaybe=TRUE;
978            break;
979        default:  // Should not occur.
980            exit(U_INTERNAL_PROGRAM_ERROR);
981        }
982        IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
983        utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
984        if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
985            indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
986        }
987        if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
988            indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
989        }
990    }
991}
992
993void Normalizer2DataBuilder::setHangulData() {
994    HangulIterator hi;
995    const HangulIterator::Range *range;
996    // Check that none of the Hangul/Jamo code points have data.
997    while((range=hi.nextRange())!=NULL) {
998        for(UChar32 c=range->start; c<range->limit; ++c) {
999            if(utrie2_get32(norm16Trie, c)!=0) {
1000                fprintf(stderr,
1001                        "gennorm2 error: "
1002                        "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
1003                        (long)c);
1004                exit(U_INVALID_FORMAT_ERROR);
1005            }
1006        }
1007    }
1008    // Set data for algorithmic runtime handling.
1009    IcuToolErrorCode errorCode("gennorm2/setHangulData()");
1010    hi.reset();
1011    while((range=hi.nextRange())!=NULL) {
1012        uint16_t norm16=range->norm16;
1013        if(norm16==0) {
1014            norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
1015            if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
1016                indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
1017            }
1018        } else {
1019            if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
1020                indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
1021            }
1022        }
1023        utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
1024        errorCode.assertSuccess();
1025    }
1026}
1027
1028U_CDECL_BEGIN
1029
1030static UBool U_CALLCONV
1031enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1032    uint32_t *pMaxValue=(uint32_t *)context;
1033    if(value>*pMaxValue) {
1034        *pMaxValue=value;
1035    }
1036    return TRUE;
1037}
1038
1039U_CDECL_END
1040
// Turns the collected per-code point data into the output data structures:
// opens and fills norm16Trie, builds extraData, computes all of indexes[],
// freezes the trie and determines its serialized length (norm16TrieLength),
// and copies the Unicode version into dataInfo.dataVersion.
// Called at the start of writeBinaryFile() and writeCSourceFile().
// Errors are reported on stderr and terminate the program.
void Normalizer2DataBuilder::processData() {
    IcuToolErrorCode errorCode("gennorm2/processData()");
    norm16Trie=utrie2_open(0, 0, errorCode);
    errorCode.assertSuccess();

    utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());

    // Repeat the decomposition pass until a pass makes no further change.
    Decomposer decomposer(*this);
    do {
        decomposer.didDecompose=FALSE;
        utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
    } while(decomposer.didDecompose);

    BuilderReorderingBuffer buffer;
    int32_t normsLength=utm_countItems(normMem);
    for(int32_t i=1; i<normsLength; ++i) {
        // Set the hasNoCompBoundaryAfter flag for use by the last code branch
        // in Normalizer2Impl::hasCompBoundaryAfter().
        // For details see the comments on hasNoCompBoundaryAfter(buffer).
        const Norm &norm=norms[i];
        if(norm.hasMapping()) {
            if(norm.compositions!=NULL) {
                norms[i].hasNoCompBoundaryAfter=TRUE;
            } else {
                buffer.reset();
                reorder(norms+i, buffer);
                norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
            }
        }
    }

    // Start the minimum code points above the Unicode range;
    // writeNorm16() and setHangulData() lower them.
    indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
    indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;

    ExtraDataWriter extraDataWriter(*this);
    utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);

    // Concatenate the per-kind strings into the final extraData.
    extraData=extraDataWriter.maybeYesCompositions;
    extraData.append(extraDataWriter.yesYesCompositions).
              append(extraDataWriter.yesNoMappingsAndCompositions).
              append(extraDataWriter.yesNoMappingsOnly).
              append(extraDataWriter.noNoMappings);
    // Pad to even length for 4-byte alignment of following data.
    if(extraData.length()&1) {
        extraData.append((UChar)0);
    }

    // The norm16 range boundaries follow from the lengths of the per-kind strings.
    indexes[Normalizer2Impl::IX_MIN_YES_NO]=
        extraDataWriter.yesYesCompositions.length();
    indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
        indexes[Normalizer2Impl::IX_MIN_YES_NO]+
        extraDataWriter.yesNoMappingsAndCompositions.length();
    indexes[Normalizer2Impl::IX_MIN_NO_NO]=
        indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
        extraDataWriter.yesNoMappingsOnly.length();
    indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
        indexes[Normalizer2Impl::IX_MIN_NO_NO]+
        extraDataWriter.noNoMappings.length();
    indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
        Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
        extraDataWriter.maybeYesCompositions.length();

    // The noNo data must end at or below the lowest norm16 value
    // that is used for the algorithmic delta encoding.
    int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
    if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
        fprintf(stderr,
                "gennorm2 error: "
                "data structure overflow, too much mapping composition data\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    // Now that the range boundaries are known, write the norm16 values.
    utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());

    setHangulData();

    // Look for the "worst" norm16 value of any supplementary code point
    // corresponding to a lead surrogate, and set it as that surrogate's value.
    // Enables quick check inner loops to look at only code units.
    //
    // We could be more sophisticated:
    // We could collect a bit set for whether there are values in the different
    // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
    // and select the best value that only breaks the composition and/or decomposition
    // inner loops if necessary.
    // However, that seems like overkill for an optimization for supplementary characters.
    for(UChar lead=0xd800; lead<0xdc00; ++lead) {
        uint32_t maxValue=utrie2_get32(norm16Trie, lead);
        utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
        if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
            maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
        ) {
            // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
            // Otherwise it might end up at something like JAMO_VT which stays in
            // the inner decomposition quick check loop.
            maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
        }
        utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
    }

    // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
    // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
    // which is harmless.
    // As a result, the minimum code points are always BMP code points.
    int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
    if(minCP>=0x10000) {
        indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
    }
    minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
    if(minCP>=0x10000) {
        indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
    }

    utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
    // Serialize without a buffer to get the length:
    // U_BUFFER_OVERFLOW_ERROR is the expected outcome here, anything else is an error.
    norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
    if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
        fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
                errorCode.errorName());
        exit(errorCode.reset());
    }
    errorCode.reset();

    // Byte offsets of the data pieces: indexes, trie, extra data, small-FCD data.
    int32_t offset=(int32_t)sizeof(indexes);
    indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
    offset+=norm16TrieLength;
    indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
    offset+=extraData.length()*2;  // 2 bytes per 16-bit unit
    indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
    offset+=sizeof(smallFCD);
    int32_t totalSize=offset;
    // All offsets from IX_RESERVED3_OFFSET through IX_TOTAL_SIZE are set to the total size.
    for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
        indexes[i]=totalSize;
    }

    if(beVerbose) {
        printf("size of normalization trie:         %5ld bytes\n", (long)norm16TrieLength);
        printf("size of 16-bit extra data:          %5ld uint16_t\n", (long)extraData.length());
        printf("size of small-FCD data:             %5ld bytes\n", (long)sizeof(smallFCD));
        printf("size of binary data file contents:  %5ld bytes\n", (long)totalSize);
        printf("minDecompNoCodePoint:              U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
        printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
        printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
        printf("minYesNoMappingsOnly:              0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
        printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
        printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
        printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
    }

    // If no Unicode version has been set (all zeros), fall back to U_UNICODE_VERSION.
    UVersionInfo nullVersion={ 0, 0, 0, 0 };
    if(0==memcmp(nullVersion, unicodeVersion, 4)) {
        u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
    }
    memcpy(dataInfo.dataVersion, unicodeVersion, 4);
}
1193
1194void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
1195    processData();
1196
1197    IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
1198    LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
1199    utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
1200    errorCode.assertSuccess();
1201
1202    UNewDataMemory *pData=
1203        udata_create(NULL, NULL, filename, &dataInfo,
1204                     haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
1205    if(errorCode.isFailure()) {
1206        fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
1207                filename, errorCode.errorName());
1208        exit(errorCode.reset());
1209    }
1210    udata_writeBlock(pData, indexes, sizeof(indexes));
1211    udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
1212    udata_writeUString(pData, extraData.getBuffer(), extraData.length());
1213    udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
1214    int32_t writtenSize=udata_finish(pData, errorCode);
1215    if(errorCode.isFailure()) {
1216        fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
1217        exit(errorCode.reset());
1218    }
1219    int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
1220    if(writtenSize!=totalSize) {
1221        fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
1222            (long)writtenSize, (long)totalSize);
1223        exit(U_INTERNAL_PROGRAM_ERROR);
1224    }
1225}
1226
1227void
1228Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
1229    processData();
1230
1231    IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()");
1232    const char *basename=findBasename(filename);
1233    CharString path(filename, (int32_t)(basename-filename), errorCode);
1234    CharString dataName(basename, errorCode);
1235    const char *extension=strrchr(basename, '.');
1236    if(extension!=NULL) {
1237        dataName.truncate((int32_t)(extension-basename));
1238    }
1239    errorCode.assertSuccess();
1240
1241    LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
1242    utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
1243    errorCode.assertSuccess();
1244
1245    FILE *f=usrc_create(path.data(), basename, "icu/source/tools/gennorm2/n2builder.cpp");
1246    if(f==NULL) {
1247        fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
1248                filename);
1249        exit(U_FILE_ACCESS_ERROR);
1250        return;
1251    }
1252    fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f);
1253    char line[100];
1254    sprintf(line, "static const UVersionInfo %s_formatVersion={", dataName.data());
1255    usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n");
1256    sprintf(line, "static const UVersionInfo %s_dataVersion={", dataName.data());
1257    usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n");
1258    sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n",
1259            dataName.data());
1260    usrc_writeArray(f,
1261        line,
1262        indexes, 32, Normalizer2Impl::IX_COUNT,
1263        "\n};\n\n");
1264    sprintf(line, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName.data());
1265    usrc_writeUTrie2Arrays(f,
1266        line, NULL,
1267        norm16Trie,
1268        "\n};\n\n");
1269    sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", dataName.data());
1270    usrc_writeArray(f,
1271        line,
1272        extraData.getBuffer(), 16, extraData.length(),
1273        "\n};\n\n");
1274    sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName.data());
1275    usrc_writeArray(f,
1276        line,
1277        smallFCD, 8, sizeof(smallFCD),
1278        "\n};\n\n");
1279    /*fputs(  // TODO
1280        "static const UCaseProps %s_singleton={\n"
1281        "  NULL,\n"
1282        "  %s_indexes,\n"
1283        "  %s_extraData,\n"
1284        "  %s_smallFCD,\n",
1285        f);*/
1286    sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data());
1287    char line2[100];
1288    sprintf(line2, "%s_trieIndex", dataName.data());
1289    usrc_writeUTrie2Struct(f,
1290        line,
1291        norm16Trie, line2, NULL,
1292        "};\n");
1293    fputs("\n#endif  // INCLUDED_FROM_NORMALIZER2_CPP\n", f);
1294    fclose(f);
1295}
1296
1297U_NAMESPACE_END
1298
1299#endif /* #if !UCONFIG_NO_NORMALIZATION */
1300
1301/*
1302 * Hey, Emacs, please set the following:
1303 *
1304 * Local Variables:
1305 * indent-tabs-mode: nil
1306 * End:
1307 */
1308