1/*
2*******************************************************************************
3*
4*   Copyright (C) 2009-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  normalizer2impl.h
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2009nov22
14*   created by: Markus W. Scherer
15*/
16
17#ifndef __NORMALIZER2IMPL_H__
18#define __NORMALIZER2IMPL_H__
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_NORMALIZATION
23
24#include "unicode/normalizer2.h"
25#include "unicode/udata.h"
26#include "unicode/unistr.h"
27#include "unicode/unorm.h"
28#include "mutex.h"
29#include "uset_imp.h"
30#include "utrie2.h"
31
32U_NAMESPACE_BEGIN
33
34class CanonIterData;
35
36class Hangul {
37public:
38    /* Korean Hangul and Jamo constants */
39    enum {
40        JAMO_L_BASE=0x1100,     /* "lead" jamo */
41        JAMO_V_BASE=0x1161,     /* "vowel" jamo */
42        JAMO_T_BASE=0x11a7,     /* "trail" jamo */
43
44        HANGUL_BASE=0xac00,
45
46        JAMO_L_COUNT=19,
47        JAMO_V_COUNT=21,
48        JAMO_T_COUNT=28,
49
50        JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT,
51
52        HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,
53        HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT
54    };
55
56    static inline UBool isHangul(UChar32 c) {
57        return HANGUL_BASE<=c && c<HANGUL_LIMIT;
58    }
59    static inline UBool
60    isHangulWithoutJamoT(UChar c) {
61        c-=HANGUL_BASE;
62        return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
63    }
64    static inline UBool isJamoL(UChar32 c) {
65        return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;
66    }
67    static inline UBool isJamoV(UChar32 c) {
68        return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;
69    }
70
71    /**
72     * Decomposes c, which must be a Hangul syllable, into buffer
73     * and returns the length of the decomposition (2 or 3).
74     */
75    static inline int32_t decompose(UChar32 c, UChar buffer[3]) {
76        c-=HANGUL_BASE;
77        UChar32 c2=c%JAMO_T_COUNT;
78        c/=JAMO_T_COUNT;
79        buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
80        buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
81        if(c2==0) {
82            return 2;
83        } else {
84            buffer[2]=(UChar)(JAMO_T_BASE+c2);
85            return 3;
86        }
87    }
88private:
89    Hangul();  // no instantiation
90};
91
92class Normalizer2Impl;
93
94class ReorderingBuffer : public UMemory {
95public:
96    ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
97        impl(ni), str(dest),
98        start(NULL), reorderStart(NULL), limit(NULL),
99        remainingCapacity(0), lastCC(0) {}
100    ~ReorderingBuffer() {
101        if(start!=NULL) {
102            str.releaseBuffer((int32_t)(limit-start));
103        }
104    }
105    UBool init(int32_t destCapacity, UErrorCode &errorCode);
106
107    UBool isEmpty() const { return start==limit; }
108    int32_t length() const { return (int32_t)(limit-start); }
109    UChar *getStart() { return start; }
110    UChar *getLimit() { return limit; }
111    uint8_t getLastCC() const { return lastCC; }
112
113    UBool equals(const UChar *start, const UChar *limit) const;
114
115    // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
116    void setLastChar(UChar c) {
117        *(limit-1)=c;
118    }
119
120    UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
121        return (c<=0xffff) ?
122            appendBMP((UChar)c, cc, errorCode) :
123            appendSupplementary(c, cc, errorCode);
124    }
125    // s must be in NFD, otherwise change the implementation.
126    UBool append(const UChar *s, int32_t length,
127                 uint8_t leadCC, uint8_t trailCC,
128                 UErrorCode &errorCode);
129    UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) {
130        if(remainingCapacity==0 && !resize(1, errorCode)) {
131            return FALSE;
132        }
133        if(lastCC<=cc || cc==0) {
134            *limit++=c;
135            lastCC=cc;
136            if(cc<=1) {
137                reorderStart=limit;
138            }
139        } else {
140            insert(c, cc);
141        }
142        --remainingCapacity;
143        return TRUE;
144    }
145    UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);
146    UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode);
147    void remove();
148    void removeSuffix(int32_t suffixLength);
149    void setReorderingLimit(UChar *newLimit) {
150        remainingCapacity+=(int32_t)(limit-newLimit);
151        reorderStart=limit=newLimit;
152        lastCC=0;
153    }
154private:
155    /*
156     * TODO: Revisit whether it makes sense to track reorderStart.
157     * It is set to after the last known character with cc<=1,
158     * which stops previousCC() before it reads that character and looks up its cc.
159     * previousCC() is normally only called from insert().
160     * In other words, reorderStart speeds up the insertion of a combining mark
161     * into a multi-combining mark sequence where it does not belong at the end.
162     * This might not be worth the trouble.
163     * On the other hand, it's not a huge amount of trouble.
164     *
165     * We probably need it for UNORM_SIMPLE_APPEND.
166     */
167
168    UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);
169    void insert(UChar32 c, uint8_t cc);
170    static void writeCodePoint(UChar *p, UChar32 c) {
171        if(c<=0xffff) {
172            *p=(UChar)c;
173        } else {
174            p[0]=U16_LEAD(c);
175            p[1]=U16_TRAIL(c);
176        }
177    }
178    UBool resize(int32_t appendLength, UErrorCode &errorCode);
179
180    const Normalizer2Impl &impl;
181    UnicodeString &str;
182    UChar *start, *reorderStart, *limit;
183    int32_t remainingCapacity;
184    uint8_t lastCC;
185
186    // private backward iterator
187    void setIterator() { codePointStart=limit; }
188    void skipPrevious();  // Requires start<codePointStart.
189    uint8_t previousCC();  // Returns 0 if there is no previous character.
190
191    UChar *codePointStart, *codePointLimit;
192};
193
194class U_COMMON_API Normalizer2Impl : public UMemory {
195public:
196    Normalizer2Impl() : memory(NULL), normTrie(NULL) {
197        fcdTrieSingleton.fInstance=NULL;
198        canonIterDataSingleton.fInstance=NULL;
199    }
200    ~Normalizer2Impl();
201
202    void load(const char *packageName, const char *name, UErrorCode &errorCode);
203
204    void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
205    void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
206
207    // low-level properties ------------------------------------------------ ***
208
209    const UTrie2 *getNormTrie() const { return normTrie; }
210    const UTrie2 *getFCDTrie(UErrorCode &errorCode) const ;
211
212    UBool ensureCanonIterData(UErrorCode &errorCode) const;
213
214    uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); }
215
216    UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
217        if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
218            return UNORM_YES;
219        } else if(minMaybeYes<=norm16) {
220            return UNORM_MAYBE;
221        } else {
222            return UNORM_NO;
223        }
224    }
225    UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; }
226    UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; }
227
228    uint8_t getCC(uint16_t norm16) const {
229        if(norm16>=MIN_NORMAL_MAYBE_YES) {
230            return (uint8_t)norm16;
231        }
232        if(norm16<minNoNo || limitNoNo<=norm16) {
233            return 0;
234        }
235        return getCCFromNoNo(norm16);
236    }
237    static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
238        return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0;
239    }
240
241    uint16_t getFCD16(UChar32 c) const { return UTRIE2_GET16(fcdTrie(), c); }
242    uint16_t getFCD16FromSingleLead(UChar c) const {
243        return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(fcdTrie(), c);
244    }
245    uint16_t getFCD16FromSupplementary(UChar32 c) const {
246        return UTRIE2_GET16_FROM_SUPP(fcdTrie(), c);
247    }
248    uint16_t getFCD16FromSurrogatePair(UChar c, UChar c2) const {
249        return getFCD16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2));
250    }
251
252    void setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
253                            UTrie2 *newFCDTrie, UErrorCode &errorCode) const;
254
255    void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
256                                     CanonIterData &newData, UErrorCode &errorCode) const;
257
258    /**
259     * Get the decomposition for one code point.
260     * @param c code point
261     * @param buffer out-only buffer for algorithmic decompositions
262     * @param length out-only, takes the length of the decomposition, if any
263     * @return pointer to the decomposition, or NULL if none
264     */
265    const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const;
266
267    UBool isCanonSegmentStarter(UChar32 c) const;
268    UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
269
270    enum {
271        MIN_CCC_LCCC_CP=0x300
272    };
273
274    enum {
275        MIN_YES_YES_WITH_CC=0xff01,
276        JAMO_VT=0xff00,
277        MIN_NORMAL_MAYBE_YES=0xfe00,
278        JAMO_L=1,
279        MAX_DELTA=0x40
280    };
281
282    enum {
283        // Byte offsets from the start of the data, after the generic header.
284        IX_NORM_TRIE_OFFSET,
285        IX_EXTRA_DATA_OFFSET,
286        IX_RESERVED2_OFFSET,
287        IX_RESERVED3_OFFSET,
288        IX_RESERVED4_OFFSET,
289        IX_RESERVED5_OFFSET,
290        IX_RESERVED6_OFFSET,
291        IX_TOTAL_SIZE,
292
293        // Code point thresholds for quick check codes.
294        IX_MIN_DECOMP_NO_CP,
295        IX_MIN_COMP_NO_MAYBE_CP,
296
297        // Norm16 value thresholds for quick check combinations and types of extra data.
298        IX_MIN_YES_NO,
299        IX_MIN_NO_NO,
300        IX_LIMIT_NO_NO,
301        IX_MIN_MAYBE_YES,
302
303        IX_RESERVED14,
304        IX_RESERVED15,
305        IX_COUNT
306    };
307
308    enum {
309        MAPPING_HAS_CCC_LCCC_WORD=0x80,
310        MAPPING_PLUS_COMPOSITION_LIST=0x40,
311        MAPPING_NO_COMP_BOUNDARY_AFTER=0x20,
312        MAPPING_LENGTH_MASK=0x1f
313    };
314
315    enum {
316        COMP_1_LAST_TUPLE=0x8000,
317        COMP_1_TRIPLE=1,
318        COMP_1_TRAIL_LIMIT=0x3400,
319        COMP_1_TRAIL_MASK=0x7ffe,
320        COMP_1_TRAIL_SHIFT=9,  // 10-1 for the "triple" bit
321        COMP_2_TRAIL_SHIFT=6,
322        COMP_2_TRAIL_MASK=0xffc0
323    };
324
325    // higher-level functionality ------------------------------------------ ***
326
327    const UChar *decompose(const UChar *src, const UChar *limit,
328                           ReorderingBuffer *buffer, UErrorCode &errorCode) const;
329    void decomposeAndAppend(const UChar *src, const UChar *limit,
330                            UBool doDecompose,
331                            ReorderingBuffer &buffer,
332                            UErrorCode &errorCode) const;
333    UBool compose(const UChar *src, const UChar *limit,
334                  UBool onlyContiguous,
335                  UBool doCompose,
336                  ReorderingBuffer &buffer,
337                  UErrorCode &errorCode) const;
338    const UChar *composeQuickCheck(const UChar *src, const UChar *limit,
339                                   UBool onlyContiguous,
340                                   UNormalizationCheckResult *pQCResult) const;
341    void composeAndAppend(const UChar *src, const UChar *limit,
342                          UBool doCompose,
343                          UBool onlyContiguous,
344                          ReorderingBuffer &buffer,
345                          UErrorCode &errorCode) const;
346    const UChar *makeFCD(const UChar *src, const UChar *limit,
347                         ReorderingBuffer *buffer, UErrorCode &errorCode) const;
348    void makeFCDAndAppend(const UChar *src, const UChar *limit,
349                          UBool doMakeFCD,
350                          ReorderingBuffer &buffer,
351                          UErrorCode &errorCode) const;
352
353    UBool hasDecompBoundary(UChar32 c, UBool before) const;
354    UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
355
356    UBool hasCompBoundaryBefore(UChar32 c) const {
357        return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
358    }
359    UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const;
360
361    UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
362    UBool hasFCDBoundaryAfter(UChar32 c) const {
363        uint16_t fcd16=getFCD16(c);
364        return fcd16<=1 || (fcd16&0xff)==0;
365    }
366    UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
367private:
368    static UBool U_CALLCONV
369    isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
370
371    UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
372    UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
373    static UBool isInert(uint16_t norm16) { return norm16==0; }
374    // static UBool isJamoL(uint16_t norm16) const { return norm16==1; }
375    static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
376    UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
377    UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
378    // UBool isCompYes(uint16_t norm16) const {
379    //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
380    // }
381    // UBool isCompYesOrMaybe(uint16_t norm16) const {
382    //     return norm16<minNoNo || minMaybeYes<=norm16;
383    // }
384    // UBool hasZeroCCFromDecompYes(uint16_t norm16) const {
385    //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
386    // }
387    UBool isDecompYesAndZeroCC(uint16_t norm16) const {
388        return norm16<minYesNo ||
389               norm16==JAMO_VT ||
390               (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
391    }
392    /**
393     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
394     * the MaybeYes which combine-forward and have ccc=0.
395     * (Standard Unicode 5.2 normalization does not have such characters.)
396     */
397    UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
398        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
399    }
400    UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; }
401
402    // For use with isCompYes().
403    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
404    // static uint8_t getCCFromYes(uint16_t norm16) {
405    //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
406    // }
407    uint8_t getCCFromNoNo(uint16_t norm16) const {
408        const uint16_t *mapping=getMapping(norm16);
409        if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
410            return (uint8_t)mapping[1];
411        } else {
412            return 0;
413        }
414    }
415    // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
416    uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const;
417
418    // Requires algorithmic-NoNo.
419    UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
420        return c+norm16-(minMaybeYes-MAX_DELTA-1);
421    }
422
423    // Requires minYesNo<norm16<limitNoNo.
424    const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; }
425    const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
426        if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
427            return NULL;
428        } else if(norm16<minMaybeYes) {
429            return extraData+norm16;  // for yesYes; if Jamo L: harmless empty list
430        } else {
431            return maybeYesCompositions+norm16-minMaybeYes;
432        }
433    }
434    const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
435        const uint16_t *list=extraData+norm16;  // composite has both mapping & compositions list
436        return list+  // mapping pointer
437            1+  // +1 to skip the first unit with the mapping lenth
438            (*list&MAPPING_LENGTH_MASK)+  // + mapping length
439            ((*list>>7)&1);  // +1 if MAPPING_HAS_CCC_LCCC_WORD
440    }
441    /**
442     * @param c code point must have compositions
443     * @return compositions list pointer
444     */
445    const uint16_t *getCompositionsList(uint16_t norm16) const {
446        return isDecompYes(norm16) ?
447                getCompositionsListForDecompYes(norm16) :
448                getCompositionsListForComposite(norm16);
449    }
450
451    const UChar *copyLowPrefixFromNulTerminated(const UChar *src,
452                                                UChar32 minNeedDataCP,
453                                                ReorderingBuffer *buffer,
454                                                UErrorCode &errorCode) const;
455    UBool decomposeShort(const UChar *src, const UChar *limit,
456                         ReorderingBuffer &buffer, UErrorCode &errorCode) const;
457    UBool decompose(UChar32 c, uint16_t norm16,
458                    ReorderingBuffer &buffer, UErrorCode &errorCode) const;
459
460    static int32_t combine(const uint16_t *list, UChar32 trail);
461    void addComposites(const uint16_t *list, UnicodeSet &set) const;
462    void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
463                   UBool onlyContiguous) const;
464
465    UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const;
466    const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
467    const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;
468
469    const UTrie2 *fcdTrie() const { return (const UTrie2 *)fcdTrieSingleton.fInstance; }
470
471    const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
472    const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;
473
474    int32_t getCanonValue(UChar32 c) const;
475    const UnicodeSet &getCanonStartSet(int32_t n) const;
476
477    UDataMemory *memory;
478    UVersionInfo dataVersion;
479
480    // Code point thresholds for quick check codes.
481    UChar32 minDecompNoCP;
482    UChar32 minCompNoMaybeCP;
483
484    // Norm16 value thresholds for quick check combinations and types of extra data.
485    uint16_t minYesNo;
486    uint16_t minNoNo;
487    uint16_t limitNoNo;
488    uint16_t minMaybeYes;
489
490    UTrie2 *normTrie;
491    const uint16_t *maybeYesCompositions;
492    const uint16_t *extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
493
494    SimpleSingleton fcdTrieSingleton;
495    SimpleSingleton canonIterDataSingleton;
496};
497
// Flag bits and value mask for the entries in canonIterData.
#define CANON_NOT_SEGMENT_STARTER   0x80000000  /* bit 31 */
#define CANON_HAS_COMPOSITIONS      0x40000000  /* bit 30 */
#define CANON_HAS_SET               0x200000    /* bit 21 */
#define CANON_VALUE_MASK            0x1fffff    /* bits 20..0 */
503
504/**
505 * ICU-internal shortcut for quick access to standard Unicode normalization.
506 */
507class U_COMMON_API Normalizer2Factory {
508public:
509    static const Normalizer2 *getNFCInstance(UErrorCode &errorCode);
510    static const Normalizer2 *getNFDInstance(UErrorCode &errorCode);
511    static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
512    static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
513    static const Normalizer2 *getNFKCInstance(UErrorCode &errorCode);
514    static const Normalizer2 *getNFKDInstance(UErrorCode &errorCode);
515    static const Normalizer2 *getNFKC_CFInstance(UErrorCode &errorCode);
516    static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);
517
518    static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);
519
520    static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
521    static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
522    static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);
523
524    // Get the Impl instance of the Normalizer2.
525    // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
526    static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
527
528    static const UTrie2 *getFCDTrie(UErrorCode &errorCode);
529private:
530    Normalizer2Factory();  // No instantiation.
531};
532
533U_NAMESPACE_END
534
535U_CAPI int32_t U_EXPORT2
536unorm2_swap(const UDataSwapper *ds,
537            const void *inData, int32_t length, void *outData,
538            UErrorCode *pErrorCode);
539
540/**
541 * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
542 * @internal
543 */
544U_CFUNC UNormalizationCheckResult U_EXPORT2
545unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
546
547/**
548 * Internal API, used by collation code.
549 * Get access to the internal FCD trie table to be able to perform
550 * incremental, per-code unit, FCD checks in collation.
551 * One pointer is sufficient because the trie index values are offset
552 * by the index size, so that the same pointer is used to access the trie data.
553 * Code points at fcdHighStart and above have a zero FCD value.
554 * @internal
555 */
556U_CAPI const uint16_t * U_EXPORT2
557unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode);
558
559/**
560 * Internal API, used by collation code.
561 * Get the FCD value for a code unit, with
562 * bits 15..8   lead combining class
563 * bits  7..0   trail combining class
564 *
565 * If c is a lead surrogate and the value is not 0,
566 * then some of c's associated supplementary code points have a non-zero FCD value.
567 *
568 * @internal
569 */
570static inline uint16_t
571unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
572    return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
573}
574
575/**
576 * Internal API, used by collation code.
577 * Get the FCD value of the next code point (post-increment), with
578 * bits 15..8   lead combining class
579 * bits  7..0   trail combining class
580 *
581 * @internal
582 */
583static inline uint16_t
584unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
585                const UChar *&s, const UChar *limit) {
586    UChar32 c=*s++;
587    uint16_t fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
588    if(fcd!=0 && U16_IS_LEAD(c)) {
589        UChar c2;
590        if(s!=limit && U16_IS_TRAIL(c2=*s)) {
591            ++s;
592            c=U16_GET_SUPPLEMENTARY(c, c2);
593            if(c<fcdHighStart) {
594                fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)];
595            } else {
596                fcd=0;
597            }
598        } else /* unpaired lead surrogate */ {
599            fcd=0;
600        }
601    }
602    return fcd;
603}
604
605/**
606 * Internal API, used by collation code.
607 * Get the FCD value of the previous code point (pre-decrement), with
608 * bits 15..8   lead combining class
609 * bits  7..0   trail combining class
610 *
611 * @internal
612 */
613static inline uint16_t
614unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
615                const UChar *start, const UChar *&s) {
616    UChar32 c=*--s;
617    uint16_t fcd;
618    if(!U16_IS_SURROGATE(c)) {
619        fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
620    } else {
621        UChar c2;
622        if(U16_IS_SURROGATE_TRAIL(c) && s!=start && U16_IS_LEAD(c2=*(s-1))) {
623            --s;
624            c=U16_GET_SUPPLEMENTARY(c2, c);
625            if(c<fcdHighStart) {
626                fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)];
627            } else {
628                fcd=0;
629            }
630        } else /* unpaired surrogate */ {
631            fcd=0;
632        }
633    }
634    return fcd;
635}
636
637/**
638 * Format of Normalizer2 .nrm data files.
639 * Format version 1.0.
640 *
641 * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
642 * ICU ships with data files for standard Unicode Normalization Forms
643 * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).
644 * Custom (application-specific) data can be built into additional .nrm files
645 * with the gennorm2 build tool.
646 *
647 * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
648 * cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
649 *
650 * A .nrm file begins with a standard ICU data file header
651 * (DataHeader, see ucmndata.h and unicode/udata.h).
652 * The UDataInfo.dataVersion field usually contains the Unicode version
653 * for which the data was generated.
654 *
655 * After the header, the file contains the following parts.
656 * Constants are defined as enum values of the Normalizer2Impl class.
657 *
658 * Many details of the data structures are described in the design doc
659 * which is at http://site.icu-project.org/design/normalization/custom
660 *
661 * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4;
662 *
663 *      The first eight indexes are byte offsets in ascending order.
664 *      Each byte offset marks the start of the next part in the data file,
665 *      and the end of the previous one.
666 *      When two consecutive byte offsets are the same, then the corresponding part is empty.
667 *      Byte offsets are offsets from after the header,
668 *      that is, from the beginning of the indexes[].
669 *      Each part starts at an offset with proper alignment for its data.
670 *      If necessary, the previous part may include padding bytes to achieve this alignment.
671 *
672 *      minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point
673 *      with a decomposition mapping, that is, with NF*D_QC=No.
674 *      minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
675 *      with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
676 *
677 *      The next four indexes are thresholds of 16-bit trie values for ranges of
678 *      values indicating multiple normalization properties.
679 *          minYesNo=indexes[IX_MIN_YES_NO];
680 *          minNoNo=indexes[IX_MIN_NO_NO];
681 *          limitNoNo=indexes[IX_LIMIT_NO_NO];
682 *          minMaybeYes=indexes[IX_MIN_MAYBE_YES];
683 *      See the normTrie description below and the design doc for details.
684 *
685 * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
686 *
687 *      The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
688 *      Rather than using independent bits in the value (which would require more than 16 bits),
689 *      information is extracted primarily via range checks.
690 *      For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
691 *      means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
692 *      which means it has a two-way (round-trip) decomposition mapping.
693 *      Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
694 *      pointing to mappings, composition lists, or both.
695 *      Value norm16==0 means that the character is normalization-inert, that is,
696 *      it does not have a mapping, does not participate in composition, has a zero
697 *      canonical combining class, and forms a boundary where text before it and after it
698 *      can be normalized independently.
699 *      For details about how multiple properties are encoded in 16-bit values
700 *      see the design doc.
701 *      Note that the encoding cannot express all combinations of the properties involved;
702 *      it only supports those combinations that are allowed by
703 *      the Unicode Normalization algorithms. Details are in the design doc as well.
704 *      The gennorm2 tool only builds .nrm files for data that conforms to the limitations.
705 *
706 *      The trie has a value for each lead surrogate code unit representing the "worst case"
707 *      properties of the 1024 supplementary characters whose UTF-16 form starts with
708 *      the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
709 *      then their lead surrogate code unit has the trie value 0.
710 *      When the lead surrogate unit's value exceeds the quick check minimum during processing,
711 *      the properties for the full supplementary code point need to be looked up.
712 *
713 * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes];
714 * uint16_t extraData[];
715 *
716 *      There is only one byte offset for the end of these two arrays.
717 *      The split between them is given by the constant and variable mentioned above.
718 *
719 *      The maybeYesCompositions array contains composition lists for characters that
720 *      combine both forward (as starters in composition pairs)
721 *      and backward (as trailing characters in composition pairs).
722 *      Such characters do not occur in Unicode 5.2 but are allowed by
723 *      the Unicode Normalization algorithms.
724 *      If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES
725 *      and the maybeYesCompositions array is empty.
726 *      If there are such characters, then minMaybeYes is subtracted from their norm16 values
727 *      to get the index into this array.
728 *
729 *      The extraData array contains composition lists for "YesYes" characters,
730 *      followed by mappings and optional composition lists for "YesNo" characters,
731 *      followed by only mappings for "NoNo" characters.
732 *      (Referring to pairs of NFC/NFD quick check values.)
733 *      The norm16 values of those characters are directly indexes into the extraData array.
734 *
735 *      The data structures for composition lists and mappings are described in the design doc.
736 */
737
738#endif  /* !UCONFIG_NO_NORMALIZATION */
739#endif  /* __NORMALIZER2IMPL_H__ */
740