ucm.c revision c73f511526464f8e56c242df80552e9b0d94ae3d
/*
*******************************************************************************
*
*   Copyright (C) 2003-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucm.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003jun20
*   created by: Markus W. Scherer
*
*   This file reads a .ucm file, stores its mappings and sorts them.
*   It implements handling of Unicode conversion mappings from .ucm files
*   for makeconv, canonucm, rptp2ucm, etc.
*
*   Unicode code point sequences with a length of more than 1,
*   as well as byte sequences with more than 4 bytes or more than one complete
*   character sequence are handled to support m:n mappings.
*/
241e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
251e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)#include "unicode/utypes.h"
26f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "unicode/ustring.h"
271e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)#include "cstring.h"
28f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)#include "cmemory.h"
291e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)#include "filestrm.h"
305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "uarrsort.h"
311e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)#include "ucnvmbcs.h"
321e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)#include "ucnv_bld.h"
331e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)#include "ucnv_ext.h"
341e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)#include "uparse.h"
351e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)#include "ucm.h"
361e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)#include <stdio.h>
371e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
381e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)#if !UCONFIG_NO_CONVERSION
391e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
/* -------------------------------------------------------------------------- */
411e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
421e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)static void
431e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
441e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    int32_t j;
45f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
46f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    for(j=0; j<m->uLen; ++j) {
47f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)        fprintf(f, "<U%04lX>", (long)codePoints[j]);
48f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    }
49f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
50f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    fputc(' ', f);
51f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
52f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    for(j=0; j<m->bLen; ++j) {
53f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)        fprintf(f, "\\x%02X", bytes[j]);
54f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    }
55f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
56f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    if(m->f>=0) {
57f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)        fprintf(f, " |%u\n", m->f);
58f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    } else {
59f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)        fputs("\n", f);
60f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    }
61f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}
62f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
63f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)U_CAPI void U_EXPORT2
64f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
65f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
66f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)}
67f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
68f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)U_CAPI void U_EXPORT2
69f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
70f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    UCMapping *m;
71f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    int32_t i, length;
72f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
73f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    m=table->mappings;
74f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    length=table->mappingsLength;
751e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    if(byUnicode) {
765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        for(i=0; i<length; ++m, ++i) {
7723730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)            ucm_printMapping(table, m, f);
7823730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)        }
7923730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)    } else {
8023730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)        const int32_t *map=table->reverseMap;
8123730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)        for(i=0; i<length; ++i) {
8223730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)            ucm_printMapping(table, m+map[i], f);
8323730a6e56a168d1879203e4b3819bb36e3d8f1fTorne (Richard Coles)        }
841e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    }
851e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)}
861e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
/* mapping comparisons ------------------------------------------------------ */
885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
891e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)static int32_t
901e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)compareUnicode(UCMTable *lTable, const UCMapping *l,
911e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)               UCMTable *rTable, const UCMapping *r) {
921e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    const UChar32 *lu, *ru;
931e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    int32_t result, i, length;
941e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if(l->uLen==1 && r->uLen==1) {
961e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        /* compare two single code points */
971e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        return l->u-r->u;
981e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    }
991e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
1001e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    /* get pointers to the code point sequences */
1011e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    lu=UCM_GET_CODE_POINTS(lTable, l);
1021e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    ru=UCM_GET_CODE_POINTS(rTable, r);
1031e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
1041e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    /* get the minimum length */
1055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if(l->uLen<=r->uLen) {
1065d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        length=l->uLen;
1075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    } else {
1085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        length=r->uLen;
1095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
1105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1115d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    /* compare the code points */
1125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    for(i=0; i<length; ++i) {
1135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        result=lu[i]-ru[i];
1145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        if(result!=0) {
1155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            return result;
1161e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        }
1171e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    }
1181e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
1191e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    /* compare the lengths */
1201e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    return l->uLen-r->uLen;
1211e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)}
1221e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
1231e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)static int32_t
1241e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)compareBytes(UCMTable *lTable, const UCMapping *l,
125116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch             UCMTable *rTable, const UCMapping *r,
126116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch             UBool lexical) {
1271e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    const uint8_t *lb, *rb;
1281e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    int32_t result, i, length;
1291e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
1301e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    /*
1311e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)     * A lexical comparison is used for sorting in the builder, to allow
1321e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)     * an efficient search for a byte sequence that could be a prefix
1331e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)     * of a previously entered byte sequence.
1341e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)     *
1351e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)     * Comparing by lengths first is for compatibility with old .ucm tools
1361e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)     * like canonucm and rptp2ucm.
137a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)     */
1385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if(lexical) {
1395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        /* get the minimum length and continue */
140a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        if(l->bLen<=r->bLen) {
1415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            length=l->bLen;
1425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        } else {
1435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            length=r->bLen;
1445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        }
145a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    } else {
146a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        /* compare lengths first */
147a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        result=l->bLen-r->bLen;
1485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        if(result!=0) {
149a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)            return result;
1505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        } else {
1515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            length=l->bLen;
152a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        }
1535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
1545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
155a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    /* get pointers to the byte sequences */
156a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    lb=UCM_GET_BYTES(lTable, l);
157a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    rb=UCM_GET_BYTES(rTable, r);
158a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
159a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    /* compare the bytes */
160a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    for(i=0; i<length; ++i) {
1611e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        result=lb[i]-rb[i];
1621e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        if(result!=0) {
1635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            return result;
1645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        }
1651e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    }
1661e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
1675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    /* compare the lengths */
1685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    return l->bLen-r->bLen;
1691e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)}
1701e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
1711e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)/* compare UCMappings for sorting */
1721e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)static int32_t
1731e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)compareMappings(UCMTable *lTable, const UCMapping *l,
1745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                UCMTable *rTable, const UCMapping *r,
1751e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                UBool uFirst) {
1761e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    int32_t result;
1771e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
1781e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    /* choose which side to compare first */
1791e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    if(uFirst) {
1801e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        /* Unicode then bytes */
1811e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        result=compareUnicode(lTable, l, rTable, r);
1821e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        if(result==0) {
1831e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
1841e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        }
1851e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    } else {
1861e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        /* bytes then Unicode */
1871e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
1881e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        if(result==0) {
1891e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            result=compareUnicode(lTable, l, rTable, r);
1901e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        }
1911e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    }
1925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if(result!=0) {
1945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        return result;
1955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
1965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1975d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    /* compare the flags */
1985d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    return l->f-r->f;
1995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
2005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)/* sorting by Unicode first sorts mappings directly */
2025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)static int32_t
2035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
2045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    return compareMappings(
2051e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        (UCMTable *)context, (const UCMapping *)left,
2061e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        (UCMTable *)context, (const UCMapping *)right, TRUE);
2071e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)}
2081e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
2091e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)/* sorting by bytes first sorts the reverseMap; use indirection to mappings */
2101e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)static int32_t
2111e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
2121e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    UCMTable *table=(UCMTable *)context;
2131e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
2141e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    return compareMappings(
2151e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        table, table->mappings+l,
2165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        table, table->mappings+r, FALSE);
2171e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)}
2181e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
2191e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)U_CAPI void U_EXPORT2
2201e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)ucm_sortTable(UCMTable *t) {
2211e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    UErrorCode errorCode;
2221e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    int32_t i;
2231e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
2241e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    if(t->isSorted) {
2251e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        return;
2261e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    }
2271e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
2281e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    errorCode=U_ZERO_ERROR;
2291e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
2305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    /* 1. sort by Unicode first */
2311e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
2321e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                   compareMappingsUnicodeFirst, t,
2331e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                   FALSE, &errorCode);
2341e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
2351e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    /* build the reverseMap */
2361e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    if(t->reverseMap==NULL) {
2371e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        /*
2381e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)         * allocate mappingsCapacity instead of mappingsLength so that
2391e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)         * if mappings are added, the reverseMap need not be
2401e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)         * reallocated each time
2411e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)         * (see ucm_moveMappings() and ucm_addMapping())
2421e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)         */
2431e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
2441e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        if(t->reverseMap==NULL) {
2451e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
2465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            exit(U_MEMORY_ALLOCATION_ERROR);
2471e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        }
2481e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    }
2491e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    for(i=0; i<t->mappingsLength; ++i) {
2501e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        t->reverseMap[i]=i;
2511e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    }
2521e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
2531e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    /* 2. sort reverseMap by mappings bytes first */
2545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
2551e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                   compareMappingsBytesFirst, t,
2561e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                   FALSE, &errorCode);
2571e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
2581e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    if(U_FAILURE(errorCode)) {
2591e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
2601e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                u_errorName(errorCode));
2611e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        exit(errorCode);
2621e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    }
2635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    t->isSorted=TRUE;
2655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
2665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)/*
2681e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles) * remove mappings with their move flag set from the base table
2691e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles) * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
2701e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles) */
2711e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)U_CAPI void U_EXPORT2
2721e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)ucm_moveMappings(UCMTable *base, UCMTable *ext) {
2731e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    UCMapping *mb, *mbLimit;
2741e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    int8_t flag;
2751e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
2761e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    mb=base->mappings;
2771e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    mbLimit=mb+base->mappingsLength;
2781e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
279f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    while(mb<mbLimit) {
2801e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        flag=mb->moveFlag;
2811e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        if(flag!=0) {
2821e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            /* reset the move flag */
2831e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            mb->moveFlag=0;
2841e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
2851e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
2865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                /* add the mapping to the extension table */
2875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
2885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            }
2895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
2905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            /* remove this mapping: move the last base mapping down and overwrite the current one */
2911e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            if(mb<(mbLimit-1)) {
2925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
2931e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            }
2941e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            --mbLimit;
2951e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            --base->mappingsLength;
2961e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            base->isSorted=FALSE;
2971e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        } else {
2981e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            ++mb;
2991e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        }
3005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
3011e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)}
3021e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
/*
 * Result bits that checkBaseExtUnicode() ORs into its uint8_t return value.
 * NEEDS_MOVE is set when at least one base mapping was marked with
 * UCM_MOVE_TO_EXT; HAS_ERRORS presumably signals a reported error
 * (its use lies outside the part of the file reviewed here).
 */
enum {
    NEEDS_MOVE=1<<0,
    HAS_ERRORS=1<<1
};
3071e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
3081e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)static uint8_t
3091e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
3101e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                    UBool moveToExt, UBool intersectBase) {
3111e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    UCMapping *mb, *me, *mbLimit, *meLimit;
3121e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    int32_t cmp;
3131e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    uint8_t result;
3141e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
3151e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    mb=base->mappings;
3161e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    mbLimit=mb+base->mappingsLength;
3171e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
3181e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    me=ext->mappings;
3191e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    meLimit=me+ext->mappingsLength;
3201e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
3211e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    result=0;
3221e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
3231e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    for(;;) {
3241e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        /* skip irrelevant mappings on both sides */
325a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        for(;;) {
326a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)            if(mb==mbLimit) {
3275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                return result;
328a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)            }
329a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
330a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)            if((0<=mb->f && mb->f<=2) || mb->f==4) {
331a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                break;
3321e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            }
3335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
3345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            ++mb;
3355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        }
3365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
3371e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        for(;;) {
3385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            if(me==meLimit) {
3391e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                return result;
3401e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            }
3411e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
3421e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            if((0<=me->f && me->f<=2) || me->f==4) {
3431e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                break;
3441e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            }
3451e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
3461e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            ++me;
3471e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        }
3481e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
3491e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        /* compare the base and extension mappings */
3501e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        cmp=compareUnicode(base, mb, ext, me);
3511e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)        if(cmp<0) {
3521e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
3531e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                /*
3541e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                 * mapping in base but not in ext, move it
3551e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                 *
356a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                 * if ext is DBCS, move DBCS mappings here
3571e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                 * and check SBCS ones for Unicode prefix below
3581e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                 */
3591e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                mb->moveFlag|=UCM_MOVE_TO_EXT;
3601e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                result|=NEEDS_MOVE;
3611e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)
3621e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            /* does mb map from an input sequence that is a prefix of me's? */
3631e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            } else if( mb->uLen<me->uLen &&
3641e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
3651e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)            ) {
3661e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                if(moveToExt) {
3671e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                    /* mark this mapping to be moved to the extension table */
3681e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                    mb->moveFlag|=UCM_MOVE_TO_EXT;
3691e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                    result|=NEEDS_MOVE;
3701e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                } else {
3711e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)                    fprintf(stderr,
372f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                            "ucm error: the base table contains a mapping whose input sequence\n"
373f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                            "           is a prefix of the input sequence of an extension mapping\n");
374f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                    ucm_printMapping(base, mb, stderr);
375f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                    ucm_printMapping(ext, me, stderr);
376f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                    result|=HAS_ERRORS;
377f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                }
378f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)            }
379f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
380f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)            ++mb;
3815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        } else if(cmp==0) {
382f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)            /*
383f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)             * same output: remove the extension mapping,
384f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)             * otherwise treat as an error
385f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)             */
386f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)            if( mb->f==me->f && mb->bLen==me->bLen &&
387f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
388f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)            ) {
389f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                me->moveFlag|=UCM_REMOVE_MAPPING;
390f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                result|=NEEDS_MOVE;
391f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)            } else if(intersectBase) {
392f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                /* mapping in base but not in ext, move it */
393f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                mb->moveFlag|=UCM_MOVE_TO_EXT;
394f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                result|=NEEDS_MOVE;
395f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)            } else {
396f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                fprintf(stderr,
397f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                        "ucm error: the base table contains a mapping whose input sequence\n"
398f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                        "           is the same as the input sequence of an extension mapping\n"
399f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                        "           but it maps differently\n");
400f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                ucm_printMapping(base, mb, stderr);
401f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                ucm_printMapping(ext, me, stderr);
402f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)                result|=HAS_ERRORS;
403f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)            }
404f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
405f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)            ++mb;
406f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)        } else /* cmp>0 */ {
407f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)            ++me;
408f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)        }
4095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    }
4105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}
411a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
4125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)static uint8_t
413effb81e5f8246d0db0270817048dc992db66e9fbBen MurdochcheckBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
414effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch                  UBool moveToExt, UBool intersectBase) {
415effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch    UCMapping *mb, *me;
416f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    int32_t *baseMap, *extMap;
4175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    int32_t b, e, bLimit, eLimit, cmp;
418f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)    uint8_t result;
4191e9bf3e0803691d0a228da41fc608347b6db4340Torne (Richard Coles)    UBool isSISO;
420
421    baseMap=base->reverseMap;
422    extMap=ext->reverseMap;
423
424    b=e=0;
425    bLimit=base->mappingsLength;
426    eLimit=ext->mappingsLength;
427
428    result=0;
429
430    isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
431
432    for(;;) {
433        /* skip irrelevant mappings on both sides */
434        for(;; ++b) {
435            if(b==bLimit) {
436                return result;
437            }
438            mb=base->mappings+baseMap[b];
439
440            if(intersectBase==2 && mb->bLen==1) {
441                /*
442                 * comparing a base against a DBCS extension:
443                 * leave SBCS base mappings alone
444                 */
445                continue;
446            }
447
448            if(mb->f==0 || mb->f==3) {
449                break;
450            }
451        }
452
453        for(;;) {
454            if(e==eLimit) {
455                return result;
456            }
457            me=ext->mappings+extMap[e];
458
459            if(me->f==0 || me->f==3) {
460                break;
461            }
462
463            ++e;
464        }
465
466        /* compare the base and extension mappings */
467        cmp=compareBytes(base, mb, ext, me, TRUE);
468        if(cmp<0) {
469            if(intersectBase) {
470                /* mapping in base but not in ext, move it */
471                mb->moveFlag|=UCM_MOVE_TO_EXT;
472                result|=NEEDS_MOVE;
473
474            /*
475             * does mb map from an input sequence that is a prefix of me's?
476             * for SI/SO tables, a single byte is never a prefix because it
477             * occurs in a separate single-byte state
478             */
479            } else if( mb->bLen<me->bLen &&
480                (!isSISO || mb->bLen>1) &&
481                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
482            ) {
483                if(moveToExt) {
484                    /* mark this mapping to be moved to the extension table */
485                    mb->moveFlag|=UCM_MOVE_TO_EXT;
486                    result|=NEEDS_MOVE;
487                } else {
488                    fprintf(stderr,
489                            "ucm error: the base table contains a mapping whose input sequence\n"
490                            "           is a prefix of the input sequence of an extension mapping\n");
491                    ucm_printMapping(base, mb, stderr);
492                    ucm_printMapping(ext, me, stderr);
493                    result|=HAS_ERRORS;
494                }
495            }
496
497            ++b;
498        } else if(cmp==0) {
499            /*
500             * same output: remove the extension mapping,
501             * otherwise treat as an error
502             */
503            if( mb->f==me->f && mb->uLen==me->uLen &&
504                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
505            ) {
506                me->moveFlag|=UCM_REMOVE_MAPPING;
507                result|=NEEDS_MOVE;
508            } else if(intersectBase) {
509                /* mapping in base but not in ext, move it */
510                mb->moveFlag|=UCM_MOVE_TO_EXT;
511                result|=NEEDS_MOVE;
512            } else {
513                fprintf(stderr,
514                        "ucm error: the base table contains a mapping whose input sequence\n"
515                        "           is the same as the input sequence of an extension mapping\n"
516                        "           but it maps differently\n");
517                ucm_printMapping(base, mb, stderr);
518                ucm_printMapping(ext, me, stderr);
519                result|=HAS_ERRORS;
520            }
521
522            ++b;
523        } else /* cmp>0 */ {
524            ++e;
525        }
526    }
527}
528
529U_CAPI UBool U_EXPORT2
530ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
531    UCMapping *m, *mLimit;
532    int32_t count;
533    UBool isOK;
534
535    m=table->mappings;
536    mLimit=m+table->mappingsLength;
537    isOK=TRUE;
538
539    while(m<mLimit) {
540        count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
541        if(count<1) {
542            ucm_printMapping(table, m, stderr);
543            isOK=FALSE;
544        }
545        ++m;
546    }
547
548    return isOK;
549}
550
551U_CAPI UBool U_EXPORT2
552ucm_checkBaseExt(UCMStates *baseStates,
553                 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
554                 UBool intersectBase) {
555    uint8_t result;
556
557    /* if we have an extension table, we must always use precision flags */
558    if(base->flagsType&UCM_FLAGS_IMPLICIT) {
559        fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
560        return FALSE;
561    }
562    if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
563        fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
564        return FALSE;
565    }
566
567    /* checking requires both tables to be sorted */
568    ucm_sortTable(base);
569    ucm_sortTable(ext);
570
571    /* check */
572    result=
573        checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
574        checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
575
576    if(result&HAS_ERRORS) {
577        return FALSE;
578    }
579
580    if(result&NEEDS_MOVE) {
581        ucm_moveMappings(ext, NULL);
582        ucm_moveMappings(base, moveTarget);
583        ucm_sortTable(base);
584        ucm_sortTable(ext);
585        if(moveTarget!=NULL) {
586            ucm_sortTable(moveTarget);
587        }
588    }
589
590    return TRUE;
591}
592
593/* merge tables for rptp2ucm ------------------------------------------------ */
594
595U_CAPI void U_EXPORT2
596ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
597                const uint8_t *subchar, int32_t subcharLength,
598                uint8_t subchar1) {
599    UCMapping *fromUMapping, *toUMapping;
600    int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
601
602    ucm_sortTable(fromUTable);
603    ucm_sortTable(toUTable);
604
605    fromUMapping=fromUTable->mappings;
606    toUMapping=toUTable->mappings;
607
608    fromUTop=fromUTable->mappingsLength;
609    toUTop=toUTable->mappingsLength;
610
611    fromUIndex=toUIndex=0;
612
613    while(fromUIndex<fromUTop && toUIndex<toUTop) {
614        cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
615        if(cmp==0) {
616            /* equal: roundtrip, nothing to do (flags are initially 0) */
617            ++fromUMapping;
618            ++toUMapping;
619
620            ++fromUIndex;
621            ++toUIndex;
622        } else if(cmp<0) {
623            /*
624             * the fromU mapping does not have a toU counterpart:
625             * fallback Unicode->codepage
626             */
627            if( (fromUMapping->bLen==subcharLength &&
628                 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
629                (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
630            ) {
631                fromUMapping->f=2; /* SUB mapping */
632            } else {
633                fromUMapping->f=1; /* normal fallback */
634            }
635
636            ++fromUMapping;
637            ++fromUIndex;
638        } else {
639            /*
640             * the toU mapping does not have a fromU counterpart:
641             * (reverse) fallback codepage->Unicode, copy it to the fromU table
642             */
643
644            /* ignore reverse fallbacks to Unicode SUB */
645            if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
646                toUMapping->f=3; /* reverse fallback */
647                ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
648
649                /* the table may have been reallocated */
650                fromUMapping=fromUTable->mappings+fromUIndex;
651            }
652
653            ++toUMapping;
654            ++toUIndex;
655        }
656    }
657
658    /* either one or both tables are exhausted */
659    while(fromUIndex<fromUTop) {
660        /* leftover fromU mappings are fallbacks */
661        if( (fromUMapping->bLen==subcharLength &&
662             0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
663            (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
664        ) {
665            fromUMapping->f=2; /* SUB mapping */
666        } else {
667            fromUMapping->f=1; /* normal fallback */
668        }
669
670        ++fromUMapping;
671        ++fromUIndex;
672    }
673
674    while(toUIndex<toUTop) {
675        /* leftover toU mappings are reverse fallbacks */
676
677        /* ignore reverse fallbacks to Unicode SUB */
678        if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
679            toUMapping->f=3; /* reverse fallback */
680            ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
681        }
682
683        ++toUMapping;
684        ++toUIndex;
685    }
686
687    fromUTable->isSorted=FALSE;
688}
689
690/* separate extension mappings out of base table for rptp2ucm --------------- */
691
692U_CAPI UBool U_EXPORT2
693ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
694    UCMTable *table;
695    UCMapping *m, *mLimit;
696    int32_t type;
697    UBool needsMove, isOK;
698
699    table=ucm->base;
700    m=table->mappings;
701    mLimit=m+table->mappingsLength;
702
703    needsMove=FALSE;
704    isOK=TRUE;
705
706    for(; m<mLimit; ++m) {
707        if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
708            fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
709            ucm_printMapping(table, m, stderr);
710            m->moveFlag|=UCM_REMOVE_MAPPING;
711            needsMove=TRUE;
712            continue;
713        }
714
715        type=ucm_mappingType(
716                &ucm->states, m,
717                UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
718        if(type<0) {
719            /* illegal byte sequence */
720            printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
721            isOK=FALSE;
722        } else if(type>0) {
723            m->moveFlag|=UCM_MOVE_TO_EXT;
724            needsMove=TRUE;
725        }
726    }
727
728    if(!isOK) {
729        return FALSE;
730    }
731    if(needsMove) {
732        ucm_moveMappings(ucm->base, ucm->ext);
733        return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
734    } else {
735        ucm_sortTable(ucm->base);
736        return TRUE;
737    }
738}
739
740/* ucm parser --------------------------------------------------------------- */
741
742U_CAPI int8_t U_EXPORT2
743ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
744    const char *s=*ps;
745    char *end;
746    uint8_t byte;
747    int8_t bLen;
748
749    bLen=0;
750    for(;;) {
751        /* skip an optional plus sign */
752        if(bLen>0 && *s=='+') {
753            ++s;
754        }
755        if(*s!='\\') {
756            break;
757        }
758
759        if( s[1]!='x' ||
760            (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
761        ) {
762            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
763            return -1;
764        }
765
766        if(bLen==UCNV_EXT_MAX_BYTES) {
767            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
768            return -1;
769        }
770        bytes[bLen++]=byte;
771        s=end;
772    }
773
774    *ps=s;
775    return bLen;
776}
777
778/* parse a mapping line; must not be empty */
779U_CAPI UBool U_EXPORT2
780ucm_parseMappingLine(UCMapping *m,
781                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
782                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
783                     const char *line) {
784    const char *s;
785    char *end;
786    UChar32 cp;
787    int32_t u16Length;
788    int8_t uLen, bLen, f;
789
790    s=line;
791    uLen=bLen=0;
792
793    /* parse code points */
794    for(;;) {
795        /* skip an optional plus sign */
796        if(uLen>0 && *s=='+') {
797            ++s;
798        }
799        if(*s!='<') {
800            break;
801        }
802
803        if( s[1]!='U' ||
804            (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
805            *end!='>'
806        ) {
807            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
808            return FALSE;
809        }
810        if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
811            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
812            return FALSE;
813        }
814
815        if(uLen==UCNV_EXT_MAX_UCHARS) {
816            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
817            return FALSE;
818        }
819        codePoints[uLen++]=cp;
820        s=end+1;
821    }
822
823    if(uLen==0) {
824        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
825        return FALSE;
826    } else if(uLen==1) {
827        m->u=codePoints[0];
828    } else {
829        UErrorCode errorCode=U_ZERO_ERROR;
830        u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
831        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
832            u16Length>UCNV_EXT_MAX_UCHARS
833        ) {
834            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
835            return FALSE;
836        }
837    }
838
839    s=u_skipWhitespace(s);
840
841    /* parse bytes */
842    bLen=ucm_parseBytes(bytes, line, &s);
843
844    if(bLen<0) {
845        return FALSE;
846    } else if(bLen==0) {
847        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
848        return FALSE;
849    } else if(bLen<=4) {
850        uprv_memcpy(m->b.bytes, bytes, bLen);
851    }
852
853    /* skip everything until the fallback indicator, even the start of a comment */
854    for(;;) {
855        if(*s==0) {
856            f=-1; /* no fallback indicator */
857            break;
858        } else if(*s=='|') {
859            f=(int8_t)(s[1]-'0');
860            if((uint8_t)f>4) {
861                fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
862                return FALSE;
863            }
864            break;
865        }
866        ++s;
867    }
868
869    m->uLen=uLen;
870    m->bLen=bLen;
871    m->f=f;
872    return TRUE;
873}
874
875/* general APIs ------------------------------------------------------------- */
876
877U_CAPI UCMTable * U_EXPORT2
878ucm_openTable() {
879    UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
880    if(table==NULL) {
881        fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
882        exit(U_MEMORY_ALLOCATION_ERROR);
883    }
884
885    memset(table, 0, sizeof(UCMTable));
886    return table;
887}
888
889U_CAPI void U_EXPORT2
890ucm_closeTable(UCMTable *table) {
891    if(table!=NULL) {
892        uprv_free(table->mappings);
893        uprv_free(table->codePoints);
894        uprv_free(table->bytes);
895        uprv_free(table->reverseMap);
896        uprv_free(table);
897    }
898}
899
900U_CAPI void U_EXPORT2
901ucm_resetTable(UCMTable *table) {
902    if(table!=NULL) {
903        table->mappingsLength=0;
904        table->flagsType=0;
905        table->unicodeMask=0;
906        table->bytesLength=table->codePointsLength=0;
907        table->isSorted=FALSE;
908    }
909}
910
911U_CAPI void U_EXPORT2
912ucm_addMapping(UCMTable *table,
913               UCMapping *m,
914               UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
915               uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
916    UCMapping *tm;
917    UChar32 c;
918    int32_t idx;
919
920    if(table->mappingsLength>=table->mappingsCapacity) {
921        /* make the mappings array larger */
922        if(table->mappingsCapacity==0) {
923            table->mappingsCapacity=1000;
924        } else {
925            table->mappingsCapacity*=10;
926        }
927        table->mappings=(UCMapping *)uprv_realloc(table->mappings,
928                                             table->mappingsCapacity*sizeof(UCMapping));
929        if(table->mappings==NULL) {
930            fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
931                            (int)table->mappingsCapacity);
932            exit(U_MEMORY_ALLOCATION_ERROR);
933        }
934
935        if(table->reverseMap!=NULL) {
936            /* the reverseMap must be reallocated in a new sort */
937            uprv_free(table->reverseMap);
938            table->reverseMap=NULL;
939        }
940    }
941
942    if(m->uLen>1 && table->codePointsCapacity==0) {
943        table->codePointsCapacity=10000;
944        table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
945        if(table->codePoints==NULL) {
946            fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
947                            (int)table->codePointsCapacity);
948            exit(U_MEMORY_ALLOCATION_ERROR);
949        }
950    }
951
952    if(m->bLen>4 && table->bytesCapacity==0) {
953        table->bytesCapacity=10000;
954        table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
955        if(table->bytes==NULL) {
956            fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
957                            (int)table->bytesCapacity);
958            exit(U_MEMORY_ALLOCATION_ERROR);
959        }
960    }
961
962    if(m->uLen>1) {
963        idx=table->codePointsLength;
964        table->codePointsLength+=m->uLen;
965        if(table->codePointsLength>table->codePointsCapacity) {
966            fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
967            exit(U_MEMORY_ALLOCATION_ERROR);
968        }
969
970        uprv_memcpy(table->codePoints+idx, codePoints, m->uLen*4);
971        m->u=idx;
972    }
973
974    if(m->bLen>4) {
975        idx=table->bytesLength;
976        table->bytesLength+=m->bLen;
977        if(table->bytesLength>table->bytesCapacity) {
978            fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
979            exit(U_MEMORY_ALLOCATION_ERROR);
980        }
981
982        uprv_memcpy(table->bytes+idx, bytes, m->bLen);
983        m->b.idx=idx;
984    }
985
986    /* set unicodeMask */
987    for(idx=0; idx<m->uLen; ++idx) {
988        c=codePoints[idx];
989        if(c>=0x10000) {
990            table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
991        } else if(U_IS_SURROGATE(c)) {
992            table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
993        }
994    }
995
996    /* set flagsType */
997    if(m->f<0) {
998        table->flagsType|=UCM_FLAGS_IMPLICIT;
999    } else {
1000        table->flagsType|=UCM_FLAGS_EXPLICIT;
1001    }
1002
1003    tm=table->mappings+table->mappingsLength++;
1004    uprv_memcpy(tm, m, sizeof(UCMapping));
1005
1006    table->isSorted=FALSE;
1007}
1008
1009U_CAPI UCMFile * U_EXPORT2
1010ucm_open() {
1011    UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
1012    if(ucm==NULL) {
1013        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
1014        exit(U_MEMORY_ALLOCATION_ERROR);
1015    }
1016
1017    memset(ucm, 0, sizeof(UCMFile));
1018
1019    ucm->base=ucm_openTable();
1020    ucm->ext=ucm_openTable();
1021
1022    ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
1023    ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
1024    ucm->states.outputType=-1;
1025    ucm->states.minCharLength=ucm->states.maxCharLength=1;
1026
1027    return ucm;
1028}
1029
1030U_CAPI void U_EXPORT2
1031ucm_close(UCMFile *ucm) {
1032    if(ucm!=NULL) {
1033        ucm_closeTable(ucm->base);
1034        ucm_closeTable(ucm->ext);
1035        uprv_free(ucm);
1036    }
1037}
1038
1039U_CAPI int32_t U_EXPORT2
1040ucm_mappingType(UCMStates *baseStates,
1041                UCMapping *m,
1042                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1043                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1044    /* check validity of the bytes and count the characters in them */
1045    int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
1046    if(count<1) {
1047        /* illegal byte sequence */
1048        return -1;
1049    }
1050
1051    /*
1052     * Suitable for an ICU conversion base table means:
1053     * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
1054     * - precision flag 0..3
1055     * - SBCS: any 1:1 mapping
1056     *         (the table stores additional bits to distinguish mapping types)
1057     * - MBCS: not a |2 SUB mapping for <subchar1>
1058     * - MBCS: not a |1 fallback to 0x00
1059     * - MBCS: not a multi-byte mapping with leading 0x00 bytes
1060     *
1061     * Further restrictions for fromUnicode tables
1062     * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
1063     *
1064     * All of the MBCS fromUnicode specific tests could be removed from here,
1065     * but the ones above are for unusual mappings, and removing the tests
1066     * from here would change canonucm output which seems gratuitous.
1067     * (Markus Scherer 2006-nov-28)
1068     *
1069     * Exception: All implicit mappings (f<0) that need to be moved
1070     * because of fromUnicode restrictions _must_ be moved here because
1071     * makeconv uses a hack for moving mappings only for the fromUnicode table
1072     * that only works with non-negative values of f.
1073     */
1074    if( m->uLen==1 && count==1 && m->f<=3 &&
1075        (baseStates->maxCharLength==1 ||
1076            !((m->f==2 && m->bLen==1) ||
1077              (m->f==1 && bytes[0]==0) ||
1078              (m->f<=1 && m->bLen>1 && bytes[0]==0)))
1079    ) {
1080        return 0; /* suitable for a base table */
1081    } else {
1082        return 1; /* needs to go into an extension table */
1083    }
1084}
1085
1086U_CAPI UBool U_EXPORT2
1087ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
1088                   UCMapping *m,
1089                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1090                   uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1091    int32_t type;
1092
1093    if(m->f==2 && m->uLen>1) {
1094        fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1095        printMapping(m, codePoints, bytes, stderr);
1096        return FALSE;
1097    }
1098
1099    if(baseStates!=NULL) {
1100        /* check validity of the bytes and count the characters in them */
1101        type=ucm_mappingType(baseStates, m, codePoints, bytes);
1102        if(type<0) {
1103            /* illegal byte sequence */
1104            printMapping(m, codePoints, bytes, stderr);
1105            return FALSE;
1106        }
1107    } else {
1108        /* not used - adding a mapping for an extension-only table before its base table is read */
1109        type=1;
1110    }
1111
1112    /*
1113     * Add the mapping to the base table if this is requested and suitable.
1114     * Otherwise, add it to the extension table.
1115     */
1116    if(forBase && type==0) {
1117        ucm_addMapping(ucm->base, m, codePoints, bytes);
1118    } else {
1119        ucm_addMapping(ucm->ext, m, codePoints, bytes);
1120    }
1121
1122    return TRUE;
1123}
1124
1125U_CAPI UBool U_EXPORT2
1126ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
1127    UCMapping m={ 0 };
1128    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
1129    uint8_t bytes[UCNV_EXT_MAX_BYTES];
1130
1131    const char *s;
1132
1133    /* ignore empty and comment lines */
1134    if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
1135        return TRUE;
1136    }
1137
1138    return
1139        ucm_parseMappingLine(&m, codePoints, bytes, line) &&
1140        ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
1141}
1142
1143U_CAPI void U_EXPORT2
1144ucm_readTable(UCMFile *ucm, FileStream* convFile,
1145              UBool forBase, UCMStates *baseStates,
1146              UErrorCode *pErrorCode) {
1147    char line[500];
1148    char *end;
1149    UBool isOK;
1150
1151    if(U_FAILURE(*pErrorCode)) {
1152        return;
1153    }
1154
1155    isOK=TRUE;
1156
1157    for(;;) {
1158        /* read the next line */
1159        if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
1160            fprintf(stderr, "incomplete charmap section\n");
1161            isOK=FALSE;
1162            break;
1163        }
1164
1165        /* remove CR LF */
1166        end=uprv_strchr(line, 0);
1167        while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
1168            --end;
1169        }
1170        *end=0;
1171
1172        /* ignore empty and comment lines */
1173        if(line[0]==0 || line[0]=='#') {
1174            continue;
1175        }
1176
1177        /* stop at the end of the mapping table */
1178        if(0==uprv_strcmp(line, "END CHARMAP")) {
1179            break;
1180        }
1181
1182        isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
1183    }
1184
1185    if(!isOK) {
1186        *pErrorCode=U_INVALID_TABLE_FORMAT;
1187    }
1188}
1189#endif
1190