1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 2003-2013, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  ucm.c
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2003jun20
16*   created by: Markus W. Scherer
17*
18*   This file reads a .ucm file, stores its mappings and sorts them.
19*   It implements handling of Unicode conversion mappings from .ucm files
20*   for makeconv, canonucm, rptp2ucm, etc.
21*
22*   Unicode code point sequences with a length of more than 1,
23*   as well as byte sequences with more than 4 bytes or more than one complete
24*   character sequence are handled to support m:n mappings.
25*/
26
27#include "unicode/utypes.h"
28#include "unicode/ustring.h"
29#include "cstring.h"
30#include "cmemory.h"
31#include "filestrm.h"
32#include "uarrsort.h"
33#include "ucnvmbcs.h"
34#include "ucnv_bld.h"
35#include "ucnv_ext.h"
36#include "uparse.h"
37#include "ucm.h"
38#include <stdio.h>
39
40#if !UCONFIG_NO_CONVERSION
41
42/* -------------------------------------------------------------------------- */
43
44static void
45printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
46    int32_t j;
47
48    for(j=0; j<m->uLen; ++j) {
49        fprintf(f, "<U%04lX>", (long)codePoints[j]);
50    }
51
52    fputc(' ', f);
53
54    for(j=0; j<m->bLen; ++j) {
55        fprintf(f, "\\x%02X", bytes[j]);
56    }
57
58    if(m->f>=0) {
59        fprintf(f, " |%u\n", m->f);
60    } else {
61        fputs("\n", f);
62    }
63}
64
65U_CAPI void U_EXPORT2
66ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
67    printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
68}
69
70U_CAPI void U_EXPORT2
71ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
72    UCMapping *m;
73    int32_t i, length;
74
75    m=table->mappings;
76    length=table->mappingsLength;
77    if(byUnicode) {
78        for(i=0; i<length; ++m, ++i) {
79            ucm_printMapping(table, m, f);
80        }
81    } else {
82        const int32_t *map=table->reverseMap;
83        for(i=0; i<length; ++i) {
84            ucm_printMapping(table, m+map[i], f);
85        }
86    }
87}
88
89/* mapping comparisons ------------------------------------------------------ */
90
91static int32_t
92compareUnicode(UCMTable *lTable, const UCMapping *l,
93               UCMTable *rTable, const UCMapping *r) {
94    const UChar32 *lu, *ru;
95    int32_t result, i, length;
96
97    if(l->uLen==1 && r->uLen==1) {
98        /* compare two single code points */
99        return l->u-r->u;
100    }
101
102    /* get pointers to the code point sequences */
103    lu=UCM_GET_CODE_POINTS(lTable, l);
104    ru=UCM_GET_CODE_POINTS(rTable, r);
105
106    /* get the minimum length */
107    if(l->uLen<=r->uLen) {
108        length=l->uLen;
109    } else {
110        length=r->uLen;
111    }
112
113    /* compare the code points */
114    for(i=0; i<length; ++i) {
115        result=lu[i]-ru[i];
116        if(result!=0) {
117            return result;
118        }
119    }
120
121    /* compare the lengths */
122    return l->uLen-r->uLen;
123}
124
125static int32_t
126compareBytes(UCMTable *lTable, const UCMapping *l,
127             UCMTable *rTable, const UCMapping *r,
128             UBool lexical) {
129    const uint8_t *lb, *rb;
130    int32_t result, i, length;
131
132    /*
133     * A lexical comparison is used for sorting in the builder, to allow
134     * an efficient search for a byte sequence that could be a prefix
135     * of a previously entered byte sequence.
136     *
137     * Comparing by lengths first is for compatibility with old .ucm tools
138     * like canonucm and rptp2ucm.
139     */
140    if(lexical) {
141        /* get the minimum length and continue */
142        if(l->bLen<=r->bLen) {
143            length=l->bLen;
144        } else {
145            length=r->bLen;
146        }
147    } else {
148        /* compare lengths first */
149        result=l->bLen-r->bLen;
150        if(result!=0) {
151            return result;
152        } else {
153            length=l->bLen;
154        }
155    }
156
157    /* get pointers to the byte sequences */
158    lb=UCM_GET_BYTES(lTable, l);
159    rb=UCM_GET_BYTES(rTable, r);
160
161    /* compare the bytes */
162    for(i=0; i<length; ++i) {
163        result=lb[i]-rb[i];
164        if(result!=0) {
165            return result;
166        }
167    }
168
169    /* compare the lengths */
170    return l->bLen-r->bLen;
171}
172
173/* compare UCMappings for sorting */
174static int32_t
175compareMappings(UCMTable *lTable, const UCMapping *l,
176                UCMTable *rTable, const UCMapping *r,
177                UBool uFirst) {
178    int32_t result;
179
180    /* choose which side to compare first */
181    if(uFirst) {
182        /* Unicode then bytes */
183        result=compareUnicode(lTable, l, rTable, r);
184        if(result==0) {
185            result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
186        }
187    } else {
188        /* bytes then Unicode */
189        result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
190        if(result==0) {
191            result=compareUnicode(lTable, l, rTable, r);
192        }
193    }
194
195    if(result!=0) {
196        return result;
197    }
198
199    /* compare the flags */
200    return l->f-r->f;
201}
202
203/* sorting by Unicode first sorts mappings directly */
204static int32_t
205compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
206    return compareMappings(
207        (UCMTable *)context, (const UCMapping *)left,
208        (UCMTable *)context, (const UCMapping *)right, TRUE);
209}
210
211/* sorting by bytes first sorts the reverseMap; use indirection to mappings */
212static int32_t
213compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
214    UCMTable *table=(UCMTable *)context;
215    int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
216    return compareMappings(
217        table, table->mappings+l,
218        table, table->mappings+r, FALSE);
219}
220
221U_CAPI void U_EXPORT2
222ucm_sortTable(UCMTable *t) {
223    UErrorCode errorCode;
224    int32_t i;
225
226    if(t->isSorted) {
227        return;
228    }
229
230    errorCode=U_ZERO_ERROR;
231
232    /* 1. sort by Unicode first */
233    uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
234                   compareMappingsUnicodeFirst, t,
235                   FALSE, &errorCode);
236
237    /* build the reverseMap */
238    if(t->reverseMap==NULL) {
239        /*
240         * allocate mappingsCapacity instead of mappingsLength so that
241         * if mappings are added, the reverseMap need not be
242         * reallocated each time
243         * (see ucm_moveMappings() and ucm_addMapping())
244         */
245        t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
246        if(t->reverseMap==NULL) {
247            fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
248            exit(U_MEMORY_ALLOCATION_ERROR);
249        }
250    }
251    for(i=0; i<t->mappingsLength; ++i) {
252        t->reverseMap[i]=i;
253    }
254
255    /* 2. sort reverseMap by mappings bytes first */
256    uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
257                   compareMappingsBytesFirst, t,
258                   FALSE, &errorCode);
259
260    if(U_FAILURE(errorCode)) {
261        fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
262                u_errorName(errorCode));
263        exit(errorCode);
264    }
265
266    t->isSorted=TRUE;
267}
268
269/*
270 * remove mappings with their move flag set from the base table
271 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
272 */
273U_CAPI void U_EXPORT2
274ucm_moveMappings(UCMTable *base, UCMTable *ext) {
275    UCMapping *mb, *mbLimit;
276    int8_t flag;
277
278    mb=base->mappings;
279    mbLimit=mb+base->mappingsLength;
280
281    while(mb<mbLimit) {
282        flag=mb->moveFlag;
283        if(flag!=0) {
284            /* reset the move flag */
285            mb->moveFlag=0;
286
287            if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
288                /* add the mapping to the extension table */
289                ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
290            }
291
292            /* remove this mapping: move the last base mapping down and overwrite the current one */
293            if(mb<(mbLimit-1)) {
294                uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
295            }
296            --mbLimit;
297            --base->mappingsLength;
298            base->isSorted=FALSE;
299        } else {
300            ++mb;
301        }
302    }
303}
304
/*
 * Result bits returned by checkBaseExtUnicode() and checkBaseExtBytes()
 * and evaluated by ucm_checkBaseExt().
 */
enum {
    NEEDS_MOVE=1,   /* at least one mapping got a move flag set */
    HAS_ERRORS=2    /* at least one conflict was reported on stderr */
};
309
310static uint8_t
311checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
312                    UBool moveToExt, UBool intersectBase) {
313    UCMapping *mb, *me, *mbLimit, *meLimit;
314    int32_t cmp;
315    uint8_t result;
316
317    mb=base->mappings;
318    mbLimit=mb+base->mappingsLength;
319
320    me=ext->mappings;
321    meLimit=me+ext->mappingsLength;
322
323    result=0;
324
325    for(;;) {
326        /* skip irrelevant mappings on both sides */
327        for(;;) {
328            if(mb==mbLimit) {
329                return result;
330            }
331
332            if((0<=mb->f && mb->f<=2) || mb->f==4) {
333                break;
334            }
335
336            ++mb;
337        }
338
339        for(;;) {
340            if(me==meLimit) {
341                return result;
342            }
343
344            if((0<=me->f && me->f<=2) || me->f==4) {
345                break;
346            }
347
348            ++me;
349        }
350
351        /* compare the base and extension mappings */
352        cmp=compareUnicode(base, mb, ext, me);
353        if(cmp<0) {
354            if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
355                /*
356                 * mapping in base but not in ext, move it
357                 *
358                 * if ext is DBCS, move DBCS mappings here
359                 * and check SBCS ones for Unicode prefix below
360                 */
361                mb->moveFlag|=UCM_MOVE_TO_EXT;
362                result|=NEEDS_MOVE;
363
364            /* does mb map from an input sequence that is a prefix of me's? */
365            } else if( mb->uLen<me->uLen &&
366                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
367            ) {
368                if(moveToExt) {
369                    /* mark this mapping to be moved to the extension table */
370                    mb->moveFlag|=UCM_MOVE_TO_EXT;
371                    result|=NEEDS_MOVE;
372                } else {
373                    fprintf(stderr,
374                            "ucm error: the base table contains a mapping whose input sequence\n"
375                            "           is a prefix of the input sequence of an extension mapping\n");
376                    ucm_printMapping(base, mb, stderr);
377                    ucm_printMapping(ext, me, stderr);
378                    result|=HAS_ERRORS;
379                }
380            }
381
382            ++mb;
383        } else if(cmp==0) {
384            /*
385             * same output: remove the extension mapping,
386             * otherwise treat as an error
387             */
388            if( mb->f==me->f && mb->bLen==me->bLen &&
389                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
390            ) {
391                me->moveFlag|=UCM_REMOVE_MAPPING;
392                result|=NEEDS_MOVE;
393            } else if(intersectBase) {
394                /* mapping in base but not in ext, move it */
395                mb->moveFlag|=UCM_MOVE_TO_EXT;
396                result|=NEEDS_MOVE;
397            } else {
398                fprintf(stderr,
399                        "ucm error: the base table contains a mapping whose input sequence\n"
400                        "           is the same as the input sequence of an extension mapping\n"
401                        "           but it maps differently\n");
402                ucm_printMapping(base, mb, stderr);
403                ucm_printMapping(ext, me, stderr);
404                result|=HAS_ERRORS;
405            }
406
407            ++mb;
408        } else /* cmp>0 */ {
409            ++me;
410        }
411    }
412}
413
414static uint8_t
415checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
416                  UBool moveToExt, UBool intersectBase) {
417    UCMapping *mb, *me;
418    int32_t *baseMap, *extMap;
419    int32_t b, e, bLimit, eLimit, cmp;
420    uint8_t result;
421    UBool isSISO;
422
423    baseMap=base->reverseMap;
424    extMap=ext->reverseMap;
425
426    b=e=0;
427    bLimit=base->mappingsLength;
428    eLimit=ext->mappingsLength;
429
430    result=0;
431
432    isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
433
434    for(;;) {
435        /* skip irrelevant mappings on both sides */
436        for(;; ++b) {
437            if(b==bLimit) {
438                return result;
439            }
440            mb=base->mappings+baseMap[b];
441
442            if(intersectBase==2 && mb->bLen==1) {
443                /*
444                 * comparing a base against a DBCS extension:
445                 * leave SBCS base mappings alone
446                 */
447                continue;
448            }
449
450            if(mb->f==0 || mb->f==3) {
451                break;
452            }
453        }
454
455        for(;;) {
456            if(e==eLimit) {
457                return result;
458            }
459            me=ext->mappings+extMap[e];
460
461            if(me->f==0 || me->f==3) {
462                break;
463            }
464
465            ++e;
466        }
467
468        /* compare the base and extension mappings */
469        cmp=compareBytes(base, mb, ext, me, TRUE);
470        if(cmp<0) {
471            if(intersectBase) {
472                /* mapping in base but not in ext, move it */
473                mb->moveFlag|=UCM_MOVE_TO_EXT;
474                result|=NEEDS_MOVE;
475
476            /*
477             * does mb map from an input sequence that is a prefix of me's?
478             * for SI/SO tables, a single byte is never a prefix because it
479             * occurs in a separate single-byte state
480             */
481            } else if( mb->bLen<me->bLen &&
482                (!isSISO || mb->bLen>1) &&
483                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
484            ) {
485                if(moveToExt) {
486                    /* mark this mapping to be moved to the extension table */
487                    mb->moveFlag|=UCM_MOVE_TO_EXT;
488                    result|=NEEDS_MOVE;
489                } else {
490                    fprintf(stderr,
491                            "ucm error: the base table contains a mapping whose input sequence\n"
492                            "           is a prefix of the input sequence of an extension mapping\n");
493                    ucm_printMapping(base, mb, stderr);
494                    ucm_printMapping(ext, me, stderr);
495                    result|=HAS_ERRORS;
496                }
497            }
498
499            ++b;
500        } else if(cmp==0) {
501            /*
502             * same output: remove the extension mapping,
503             * otherwise treat as an error
504             */
505            if( mb->f==me->f && mb->uLen==me->uLen &&
506                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
507            ) {
508                me->moveFlag|=UCM_REMOVE_MAPPING;
509                result|=NEEDS_MOVE;
510            } else if(intersectBase) {
511                /* mapping in base but not in ext, move it */
512                mb->moveFlag|=UCM_MOVE_TO_EXT;
513                result|=NEEDS_MOVE;
514            } else {
515                fprintf(stderr,
516                        "ucm error: the base table contains a mapping whose input sequence\n"
517                        "           is the same as the input sequence of an extension mapping\n"
518                        "           but it maps differently\n");
519                ucm_printMapping(base, mb, stderr);
520                ucm_printMapping(ext, me, stderr);
521                result|=HAS_ERRORS;
522            }
523
524            ++b;
525        } else /* cmp>0 */ {
526            ++e;
527        }
528    }
529}
530
531U_CAPI UBool U_EXPORT2
532ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
533    UCMapping *m, *mLimit;
534    int32_t count;
535    UBool isOK;
536
537    m=table->mappings;
538    mLimit=m+table->mappingsLength;
539    isOK=TRUE;
540
541    while(m<mLimit) {
542        count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
543        if(count<1) {
544            ucm_printMapping(table, m, stderr);
545            isOK=FALSE;
546        }
547        ++m;
548    }
549
550    return isOK;
551}
552
553U_CAPI UBool U_EXPORT2
554ucm_checkBaseExt(UCMStates *baseStates,
555                 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
556                 UBool intersectBase) {
557    uint8_t result;
558
559    /* if we have an extension table, we must always use precision flags */
560    if(base->flagsType&UCM_FLAGS_IMPLICIT) {
561        fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
562        return FALSE;
563    }
564    if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
565        fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
566        return FALSE;
567    }
568
569    /* checking requires both tables to be sorted */
570    ucm_sortTable(base);
571    ucm_sortTable(ext);
572
573    /* check */
574    result=
575        checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
576        checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
577
578    if(result&HAS_ERRORS) {
579        return FALSE;
580    }
581
582    if(result&NEEDS_MOVE) {
583        ucm_moveMappings(ext, NULL);
584        ucm_moveMappings(base, moveTarget);
585        ucm_sortTable(base);
586        ucm_sortTable(ext);
587        if(moveTarget!=NULL) {
588            ucm_sortTable(moveTarget);
589        }
590    }
591
592    return TRUE;
593}
594
595/* merge tables for rptp2ucm ------------------------------------------------ */
596
597U_CAPI void U_EXPORT2
598ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
599                const uint8_t *subchar, int32_t subcharLength,
600                uint8_t subchar1) {
601    UCMapping *fromUMapping, *toUMapping;
602    int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
603
604    ucm_sortTable(fromUTable);
605    ucm_sortTable(toUTable);
606
607    fromUMapping=fromUTable->mappings;
608    toUMapping=toUTable->mappings;
609
610    fromUTop=fromUTable->mappingsLength;
611    toUTop=toUTable->mappingsLength;
612
613    fromUIndex=toUIndex=0;
614
615    while(fromUIndex<fromUTop && toUIndex<toUTop) {
616        cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
617        if(cmp==0) {
618            /* equal: roundtrip, nothing to do (flags are initially 0) */
619            ++fromUMapping;
620            ++toUMapping;
621
622            ++fromUIndex;
623            ++toUIndex;
624        } else if(cmp<0) {
625            /*
626             * the fromU mapping does not have a toU counterpart:
627             * fallback Unicode->codepage
628             */
629            if( (fromUMapping->bLen==subcharLength &&
630                 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
631                (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
632            ) {
633                fromUMapping->f=2; /* SUB mapping */
634            } else {
635                fromUMapping->f=1; /* normal fallback */
636            }
637
638            ++fromUMapping;
639            ++fromUIndex;
640        } else {
641            /*
642             * the toU mapping does not have a fromU counterpart:
643             * (reverse) fallback codepage->Unicode, copy it to the fromU table
644             */
645
646            /* ignore reverse fallbacks to Unicode SUB */
647            if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
648                toUMapping->f=3; /* reverse fallback */
649                ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
650
651                /* the table may have been reallocated */
652                fromUMapping=fromUTable->mappings+fromUIndex;
653            }
654
655            ++toUMapping;
656            ++toUIndex;
657        }
658    }
659
660    /* either one or both tables are exhausted */
661    while(fromUIndex<fromUTop) {
662        /* leftover fromU mappings are fallbacks */
663        if( (fromUMapping->bLen==subcharLength &&
664             0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
665            (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
666        ) {
667            fromUMapping->f=2; /* SUB mapping */
668        } else {
669            fromUMapping->f=1; /* normal fallback */
670        }
671
672        ++fromUMapping;
673        ++fromUIndex;
674    }
675
676    while(toUIndex<toUTop) {
677        /* leftover toU mappings are reverse fallbacks */
678
679        /* ignore reverse fallbacks to Unicode SUB */
680        if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
681            toUMapping->f=3; /* reverse fallback */
682            ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
683        }
684
685        ++toUMapping;
686        ++toUIndex;
687    }
688
689    fromUTable->isSorted=FALSE;
690}
691
692/* separate extension mappings out of base table for rptp2ucm --------------- */
693
694U_CAPI UBool U_EXPORT2
695ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
696    UCMTable *table;
697    UCMapping *m, *mLimit;
698    int32_t type;
699    UBool needsMove, isOK;
700
701    table=ucm->base;
702    m=table->mappings;
703    mLimit=m+table->mappingsLength;
704
705    needsMove=FALSE;
706    isOK=TRUE;
707
708    for(; m<mLimit; ++m) {
709        if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
710            fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
711            ucm_printMapping(table, m, stderr);
712            m->moveFlag|=UCM_REMOVE_MAPPING;
713            needsMove=TRUE;
714            continue;
715        }
716
717        type=ucm_mappingType(
718                &ucm->states, m,
719                UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
720        if(type<0) {
721            /* illegal byte sequence */
722            printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
723            isOK=FALSE;
724        } else if(type>0) {
725            m->moveFlag|=UCM_MOVE_TO_EXT;
726            needsMove=TRUE;
727        }
728    }
729
730    if(!isOK) {
731        return FALSE;
732    }
733    if(needsMove) {
734        ucm_moveMappings(ucm->base, ucm->ext);
735        return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
736    } else {
737        ucm_sortTable(ucm->base);
738        return TRUE;
739    }
740}
741
742/* ucm parser --------------------------------------------------------------- */
743
/* value of a hexadecimal digit, or -1 if c is not a hexadecimal digit */
static int32_t
hexDigitValue(char c) {
    if('0'<=c && c<='9') {
        return c-'0';
    } else if('a'<=c && c<='f') {
        return c-'a'+10;
    } else if('A'<=c && c<='F') {
        return c-'A'+10;
    } else {
        return -1;
    }
}

/*
 * Parse a sequence of bytes written as \xXX (exactly 2 hex digits each),
 * optionally separated by '+', starting at *ps.
 *
 * bytes  receives the parsed bytes, at most UCNV_EXT_MAX_BYTES
 * line   the whole input line, used only for error messages
 * ps     in: start of the byte sequence; out on success: the first
 *        character after it (not changed on error)
 *
 * Returns the number of bytes parsed (0 if *ps does not start with a
 * backslash), or -1 after printing an error message to stderr.
 *
 * The digits are checked one by one rather than with strtoul() because
 * strtoul() skips leading white space and accepts a sign, which would let
 * input like "\x 1" or "\x-1" pass as a byte.
 */
U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
    const char *s=*ps;
    int32_t hi, lo;
    int8_t bLen;

    bLen=0;
    for(;;) {
        /* skip an optional plus sign */
        if(bLen>0 && *s=='+') {
            ++s;
        }
        if(*s!='\\') {
            break;
        }

        /*
         * require 'x', two hex digits, and no third hex digit;
         * the checks short-circuit so that nothing past a NUL is read
         */
        if( s[1]!='x' ||
            (hi=hexDigitValue(s[2]))<0 ||
            (lo=hexDigitValue(s[3]))<0 ||
            hexDigitValue(s[4])>=0
        ) {
            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
            return -1;
        }

        if(bLen==UCNV_EXT_MAX_BYTES) {
            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
            return -1;
        }
        bytes[bLen++]=(uint8_t)((hi<<4)|lo);
        s+=4;
    }

    *ps=s;
    return bLen;
}
779
780/* parse a mapping line; must not be empty */
781U_CAPI UBool U_EXPORT2
782ucm_parseMappingLine(UCMapping *m,
783                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
784                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
785                     const char *line) {
786    const char *s;
787    char *end;
788    UChar32 cp;
789    int32_t u16Length;
790    int8_t uLen, bLen, f;
791
792    s=line;
793    uLen=bLen=0;
794
795    /* parse code points */
796    for(;;) {
797        /* skip an optional plus sign */
798        if(uLen>0 && *s=='+') {
799            ++s;
800        }
801        if(*s!='<') {
802            break;
803        }
804
805        if( s[1]!='U' ||
806            (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
807            *end!='>'
808        ) {
809            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
810            return FALSE;
811        }
812        if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
813            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
814            return FALSE;
815        }
816
817        if(uLen==UCNV_EXT_MAX_UCHARS) {
818            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
819            return FALSE;
820        }
821        codePoints[uLen++]=cp;
822        s=end+1;
823    }
824
825    if(uLen==0) {
826        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
827        return FALSE;
828    } else if(uLen==1) {
829        m->u=codePoints[0];
830    } else {
831        UErrorCode errorCode=U_ZERO_ERROR;
832        u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
833        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
834            u16Length>UCNV_EXT_MAX_UCHARS
835        ) {
836            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
837            return FALSE;
838        }
839    }
840
841    s=u_skipWhitespace(s);
842
843    /* parse bytes */
844    bLen=ucm_parseBytes(bytes, line, &s);
845
846    if(bLen<0) {
847        return FALSE;
848    } else if(bLen==0) {
849        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
850        return FALSE;
851    } else if(bLen<=4) {
852        uprv_memcpy(m->b.bytes, bytes, bLen);
853    }
854
855    /* skip everything until the fallback indicator, even the start of a comment */
856    for(;;) {
857        if(*s==0) {
858            f=-1; /* no fallback indicator */
859            break;
860        } else if(*s=='|') {
861            f=(int8_t)(s[1]-'0');
862            if((uint8_t)f>4) {
863                fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
864                return FALSE;
865            }
866            break;
867        }
868        ++s;
869    }
870
871    m->uLen=uLen;
872    m->bLen=bLen;
873    m->f=f;
874    return TRUE;
875}
876
877/* general APIs ------------------------------------------------------------- */
878
879U_CAPI UCMTable * U_EXPORT2
880ucm_openTable() {
881    UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
882    if(table==NULL) {
883        fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
884        exit(U_MEMORY_ALLOCATION_ERROR);
885    }
886
887    memset(table, 0, sizeof(UCMTable));
888    return table;
889}
890
891U_CAPI void U_EXPORT2
892ucm_closeTable(UCMTable *table) {
893    if(table!=NULL) {
894        uprv_free(table->mappings);
895        uprv_free(table->codePoints);
896        uprv_free(table->bytes);
897        uprv_free(table->reverseMap);
898        uprv_free(table);
899    }
900}
901
902U_CAPI void U_EXPORT2
903ucm_resetTable(UCMTable *table) {
904    if(table!=NULL) {
905        table->mappingsLength=0;
906        table->flagsType=0;
907        table->unicodeMask=0;
908        table->bytesLength=table->codePointsLength=0;
909        table->isSorted=FALSE;
910    }
911}
912
913U_CAPI void U_EXPORT2
914ucm_addMapping(UCMTable *table,
915               UCMapping *m,
916               UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
917               uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
918    UCMapping *tm;
919    UChar32 c;
920    int32_t idx;
921
922    if(table->mappingsLength>=table->mappingsCapacity) {
923        /* make the mappings array larger */
924        if(table->mappingsCapacity==0) {
925            table->mappingsCapacity=1000;
926        } else {
927            table->mappingsCapacity*=10;
928        }
929        table->mappings=(UCMapping *)uprv_realloc(table->mappings,
930                                             table->mappingsCapacity*sizeof(UCMapping));
931        if(table->mappings==NULL) {
932            fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
933                            (int)table->mappingsCapacity);
934            exit(U_MEMORY_ALLOCATION_ERROR);
935        }
936
937        if(table->reverseMap!=NULL) {
938            /* the reverseMap must be reallocated in a new sort */
939            uprv_free(table->reverseMap);
940            table->reverseMap=NULL;
941        }
942    }
943
944    if(m->uLen>1 && table->codePointsCapacity==0) {
945        table->codePointsCapacity=10000;
946        table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
947        if(table->codePoints==NULL) {
948            fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
949                            (int)table->codePointsCapacity);
950            exit(U_MEMORY_ALLOCATION_ERROR);
951        }
952    }
953
954    if(m->bLen>4 && table->bytesCapacity==0) {
955        table->bytesCapacity=10000;
956        table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
957        if(table->bytes==NULL) {
958            fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
959                            (int)table->bytesCapacity);
960            exit(U_MEMORY_ALLOCATION_ERROR);
961        }
962    }
963
964    if(m->uLen>1) {
965        idx=table->codePointsLength;
966        table->codePointsLength+=m->uLen;
967        if(table->codePointsLength>table->codePointsCapacity) {
968            fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
969            exit(U_MEMORY_ALLOCATION_ERROR);
970        }
971
972        uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4);
973        m->u=idx;
974    }
975
976    if(m->bLen>4) {
977        idx=table->bytesLength;
978        table->bytesLength+=m->bLen;
979        if(table->bytesLength>table->bytesCapacity) {
980            fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
981            exit(U_MEMORY_ALLOCATION_ERROR);
982        }
983
984        uprv_memcpy(table->bytes+idx, bytes, m->bLen);
985        m->b.idx=idx;
986    }
987
988    /* set unicodeMask */
989    for(idx=0; idx<m->uLen; ++idx) {
990        c=codePoints[idx];
991        if(c>=0x10000) {
992            table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
993        } else if(U_IS_SURROGATE(c)) {
994            table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
995        }
996    }
997
998    /* set flagsType */
999    if(m->f<0) {
1000        table->flagsType|=UCM_FLAGS_IMPLICIT;
1001    } else {
1002        table->flagsType|=UCM_FLAGS_EXPLICIT;
1003    }
1004
1005    tm=table->mappings+table->mappingsLength++;
1006    uprv_memcpy(tm, m, sizeof(UCMapping));
1007
1008    table->isSorted=FALSE;
1009}
1010
1011U_CAPI UCMFile * U_EXPORT2
1012ucm_open() {
1013    UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
1014    if(ucm==NULL) {
1015        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
1016        exit(U_MEMORY_ALLOCATION_ERROR);
1017    }
1018
1019    memset(ucm, 0, sizeof(UCMFile));
1020
1021    ucm->base=ucm_openTable();
1022    ucm->ext=ucm_openTable();
1023
1024    ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
1025    ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
1026    ucm->states.outputType=-1;
1027    ucm->states.minCharLength=ucm->states.maxCharLength=1;
1028
1029    return ucm;
1030}
1031
1032U_CAPI void U_EXPORT2
1033ucm_close(UCMFile *ucm) {
1034    if(ucm!=NULL) {
1035        ucm_closeTable(ucm->base);
1036        ucm_closeTable(ucm->ext);
1037        uprv_free(ucm);
1038    }
1039}
1040
1041U_CAPI int32_t U_EXPORT2
1042ucm_mappingType(UCMStates *baseStates,
1043                UCMapping *m,
1044                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1045                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1046    /* check validity of the bytes and count the characters in them */
1047    int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
1048    if(count<1) {
1049        /* illegal byte sequence */
1050        return -1;
1051    }
1052
1053    /*
1054     * Suitable for an ICU conversion base table means:
1055     * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
1056     * - precision flag 0..3
1057     * - SBCS: any 1:1 mapping
1058     *         (the table stores additional bits to distinguish mapping types)
1059     * - MBCS: not a |2 SUB mapping for <subchar1>
1060     * - MBCS: not a |1 fallback to 0x00
1061     * - MBCS: not a multi-byte mapping with leading 0x00 bytes
1062     *
1063     * Further restrictions for fromUnicode tables
1064     * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
1065     *
1066     * All of the MBCS fromUnicode specific tests could be removed from here,
1067     * but the ones above are for unusual mappings, and removing the tests
1068     * from here would change canonucm output which seems gratuitous.
1069     * (Markus Scherer 2006-nov-28)
1070     *
1071     * Exception: All implicit mappings (f<0) that need to be moved
1072     * because of fromUnicode restrictions _must_ be moved here because
1073     * makeconv uses a hack for moving mappings only for the fromUnicode table
1074     * that only works with non-negative values of f.
1075     */
1076    if( m->uLen==1 && count==1 && m->f<=3 &&
1077        (baseStates->maxCharLength==1 ||
1078            !((m->f==2 && m->bLen==1) ||
1079              (m->f==1 && bytes[0]==0) ||
1080              (m->f<=1 && m->bLen>1 && bytes[0]==0)))
1081    ) {
1082        return 0; /* suitable for a base table */
1083    } else {
1084        return 1; /* needs to go into an extension table */
1085    }
1086}
1087
1088U_CAPI UBool U_EXPORT2
1089ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
1090                   UCMapping *m,
1091                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1092                   uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1093    int32_t type;
1094
1095    if(m->f==2 && m->uLen>1) {
1096        fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1097        printMapping(m, codePoints, bytes, stderr);
1098        return FALSE;
1099    }
1100
1101    if(baseStates!=NULL) {
1102        /* check validity of the bytes and count the characters in them */
1103        type=ucm_mappingType(baseStates, m, codePoints, bytes);
1104        if(type<0) {
1105            /* illegal byte sequence */
1106            printMapping(m, codePoints, bytes, stderr);
1107            return FALSE;
1108        }
1109    } else {
1110        /* not used - adding a mapping for an extension-only table before its base table is read */
1111        type=1;
1112    }
1113
1114    /*
1115     * Add the mapping to the base table if this is requested and suitable.
1116     * Otherwise, add it to the extension table.
1117     */
1118    if(forBase && type==0) {
1119        ucm_addMapping(ucm->base, m, codePoints, bytes);
1120    } else {
1121        ucm_addMapping(ucm->ext, m, codePoints, bytes);
1122    }
1123
1124    return TRUE;
1125}
1126
1127U_CAPI UBool U_EXPORT2
1128ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
1129  UCMapping m={ 0, {0}, 0, 0, 0, 0 };
1130    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
1131    uint8_t bytes[UCNV_EXT_MAX_BYTES];
1132
1133    const char *s;
1134
1135    /* ignore empty and comment lines */
1136    if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
1137        return TRUE;
1138    }
1139
1140    return
1141        ucm_parseMappingLine(&m, codePoints, bytes, line) &&
1142        ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
1143}
1144
1145U_CAPI void U_EXPORT2
1146ucm_readTable(UCMFile *ucm, FileStream* convFile,
1147              UBool forBase, UCMStates *baseStates,
1148              UErrorCode *pErrorCode) {
1149    char line[500];
1150    char *end;
1151    UBool isOK;
1152
1153    if(U_FAILURE(*pErrorCode)) {
1154        return;
1155    }
1156
1157    isOK=TRUE;
1158
1159    for(;;) {
1160        /* read the next line */
1161        if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
1162            fprintf(stderr, "incomplete charmap section\n");
1163            isOK=FALSE;
1164            break;
1165        }
1166
1167        /* remove CR LF */
1168        end=uprv_strchr(line, 0);
1169        while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
1170            --end;
1171        }
1172        *end=0;
1173
1174        /* ignore empty and comment lines */
1175        if(line[0]==0 || line[0]=='#') {
1176            continue;
1177        }
1178
1179        /* stop at the end of the mapping table */
1180        if(0==uprv_strcmp(line, "END CHARMAP")) {
1181            break;
1182        }
1183
1184        isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
1185    }
1186
1187    if(!isOK) {
1188        *pErrorCode=U_INVALID_TABLE_FORMAT;
1189    }
1190}
1191#endif
1192