1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 2003-2012, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  ucmstate.c
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2003oct09
16*   created by: Markus W. Scherer
17*
18*   This file handles ICU .ucm file state information as part of the ucm module.
19*   Most of this code used to be in makeconv.c.
20*/
21
22#include "unicode/utypes.h"
23#include "cstring.h"
24#include "cmemory.h"
25#include "uarrsort.h"
26#include "ucnvmbcs.h"
27#include "ucnv_ext.h"
28#include "uparse.h"
29#include "ucm.h"
30#include <stdio.h>
31
32#if !UCONFIG_NO_CONVERSION
33
34/* MBCS state handling ------------------------------------------------------ */
35
36/*
37 * state table row grammar (ebnf-style):
38 * (whitespace is allowed between all tokens)
39 *
40 * row=[[firstentry ','] entry (',' entry)*]
41 * firstentry="initial" | "surrogates"
42 *            (initial state (default for state 0), output is all surrogate pairs)
43 * entry=range [':' nextstate] ['.' action]
44 * range=number ['-' number]
45 * nextstate=number
46 *           (0..7f)
47 * action='u' | 's' | 'p' | 'i'
48 *        (unassigned, state change only, surrogate pair, illegal)
49 * number=(1- or 2-digit hexadecimal number)
50 */
51static const char *
52parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
53    const char *t;
54    uint32_t start, end, i;
55    int32_t entry;
56
57    /* initialize the state: all illegal with U+ffff */
58    for(i=0; i<256; ++i) {
59        state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
60    }
61
62    /* skip leading white space */
63    s=u_skipWhitespace(s);
64
65    /* is there an "initial" or "surrogates" directive? */
66    if(uprv_strncmp("initial", s, 7)==0) {
67        *pFlags=MBCS_STATE_FLAG_DIRECT;
68        s=u_skipWhitespace(s+7);
69        if(*s++!=',') {
70            return s-1;
71        }
72    } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
73        *pFlags=MBCS_STATE_FLAG_SURROGATES;
74        s=u_skipWhitespace(s+10);
75        if(*s++!=',') {
76            return s-1;
77        }
78    } else if(*s==0) {
79        /* empty state row: all-illegal */
80        return NULL;
81    }
82
83    for(;;) {
84        /* read an entry, the start of the range first */
85        s=u_skipWhitespace(s);
86        start=uprv_strtoul(s, (char **)&t, 16);
87        if(s==t || 0xff<start) {
88            return s;
89        }
90        s=u_skipWhitespace(t);
91
92        /* read the end of the range if there is one */
93        if(*s=='-') {
94            s=u_skipWhitespace(s+1);
95            end=uprv_strtoul(s, (char **)&t, 16);
96            if(s==t || end<start || 0xff<end) {
97                return s;
98            }
99            s=u_skipWhitespace(t);
100        } else {
101            end=start;
102        }
103
104        /* determine the state entrys for this range */
105        if(*s!=':' && *s!='.') {
106            /* the default is: final state with valid entries */
107            entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
108        } else {
109            entry=MBCS_ENTRY_TRANSITION(0, 0);
110            if(*s==':') {
111                /* get the next state, default to 0 */
112                s=u_skipWhitespace(s+1);
113                i=uprv_strtoul(s, (char **)&t, 16);
114                if(s!=t) {
115                    if(0x7f<i) {
116                        return s;
117                    }
118                    s=u_skipWhitespace(t);
119                    entry=MBCS_ENTRY_SET_STATE(entry, i);
120                }
121            }
122
123            /* get the state action, default to valid */
124            if(*s=='.') {
125                /* this is a final state */
126                entry=MBCS_ENTRY_SET_FINAL(entry);
127
128                s=u_skipWhitespace(s+1);
129                if(*s=='u') {
130                    /* unassigned set U+fffe */
131                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
132                    s=u_skipWhitespace(s+1);
133                } else if(*s=='p') {
134                    if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
135                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
136                    } else {
137                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
138                    }
139                    s=u_skipWhitespace(s+1);
140                } else if(*s=='s') {
141                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
142                    s=u_skipWhitespace(s+1);
143                } else if(*s=='i') {
144                    /* illegal set U+ffff */
145                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
146                    s=u_skipWhitespace(s+1);
147                } else {
148                    /* default to valid */
149                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
150                }
151            } else {
152                /* this is an intermediate state, nothing to do */
153            }
154        }
155
156        /* adjust "final valid" states according to the state flags */
157        if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
158            switch(*pFlags) {
159            case 0:
160                /* no adjustment */
161                break;
162            case MBCS_STATE_FLAG_DIRECT:
163                /* set the valid-direct code point to "unassigned"==0xfffe */
164                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
165                break;
166            case MBCS_STATE_FLAG_SURROGATES:
167                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
168                break;
169            default:
170                break;
171            }
172        }
173
174        /* set this entry for the range */
175        for(i=start; i<=end; ++i) {
176            state[i]=entry;
177        }
178
179        if(*s==',') {
180            ++s;
181        } else {
182            return *s==0 ? NULL : s;
183        }
184    }
185}
186
187U_CAPI void U_EXPORT2
188ucm_addState(UCMStates *states, const char *s) {
189    const char *error;
190
191    if(states->countStates==MBCS_MAX_STATE_COUNT) {
192        fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
193        exit(U_INVALID_TABLE_FORMAT);
194    }
195
196    error=parseState(s, states->stateTable[states->countStates],
197                       &states->stateFlags[states->countStates]);
198    if(error!=NULL) {
199        fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error);
200        exit(U_INVALID_TABLE_FORMAT);
201    }
202
203    ++states->countStates;
204}
205
206U_CAPI UBool U_EXPORT2
207ucm_parseHeaderLine(UCMFile *ucm,
208                    char *line, char **pKey, char **pValue) {
209    UCMStates *states;
210    char *s, *end;
211    char c;
212
213    states=&ucm->states;
214
215    /* remove comments and trailing CR and LF and remove whitespace from the end */
216    for(end=line; (c=*end)!=0; ++end) {
217        if(c=='#' || c=='\r' || c=='\n') {
218            break;
219        }
220    }
221    while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
222        --end;
223    }
224    *end=0;
225
226    /* skip leading white space and ignore empty lines */
227    s=(char *)u_skipWhitespace(line);
228    if(*s==0) {
229        return TRUE;
230    }
231
232    /* stop at the beginning of the mapping section */
233    if(uprv_memcmp(s, "CHARMAP", 7)==0) {
234        return FALSE;
235    }
236
237    /* get the key name, bracketed in <> */
238    if(*s!='<') {
239        fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
240        exit(U_INVALID_TABLE_FORMAT);
241    }
242    *pKey=++s;
243    while(*s!='>') {
244        if(*s==0) {
245            fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
246            exit(U_INVALID_TABLE_FORMAT);
247        }
248        ++s;
249    }
250    *s=0;
251
252    /* get the value string, possibly quoted */
253    s=(char *)u_skipWhitespace(s+1);
254    if(*s!='"') {
255        *pValue=s;
256    } else {
257        /* remove the quotes */
258        *pValue=s+1;
259        if(end>*pValue && *(end-1)=='"') {
260            *--end=0;
261        }
262    }
263
264    /* collect the information from the header field, ignore unknown keys */
265    if(uprv_strcmp(*pKey, "uconv_class")==0) {
266        if(uprv_strcmp(*pValue, "DBCS")==0) {
267            states->conversionType=UCNV_DBCS;
268        } else if(uprv_strcmp(*pValue, "SBCS")==0) {
269            states->conversionType = UCNV_SBCS;
270        } else if(uprv_strcmp(*pValue, "MBCS")==0) {
271            states->conversionType = UCNV_MBCS;
272        } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
273            states->conversionType = UCNV_EBCDIC_STATEFUL;
274        } else {
275            fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
276            exit(U_INVALID_TABLE_FORMAT);
277        }
278        return TRUE;
279    } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
280        c=**pValue;
281        if('1'<=c && c<='4' && (*pValue)[1]==0) {
282            states->maxCharLength=(int8_t)(c-'0');
283            states->outputType=(int8_t)(states->maxCharLength-1);
284        } else {
285            fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
286            exit(U_INVALID_TABLE_FORMAT);
287        }
288        return TRUE;
289    } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
290        c=**pValue;
291        if('1'<=c && c<='4' && (*pValue)[1]==0) {
292            states->minCharLength=(int8_t)(c-'0');
293        } else {
294            fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
295            exit(U_INVALID_TABLE_FORMAT);
296        }
297        return TRUE;
298    } else if(uprv_strcmp(*pKey, "icu:state")==0) {
299        /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
300        switch(states->conversionType) {
301        case UCNV_SBCS:
302        case UCNV_DBCS:
303        case UCNV_EBCDIC_STATEFUL:
304            states->conversionType=UCNV_MBCS;
305            break;
306        case UCNV_MBCS:
307            break;
308        default:
309            fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
310            exit(U_INVALID_TABLE_FORMAT);
311        }
312
313        if(states->maxCharLength==0) {
314            fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
315            exit(U_INVALID_TABLE_FORMAT);
316        }
317        ucm_addState(states, *pValue);
318        return TRUE;
319    } else if(uprv_strcmp(*pKey, "icu:base")==0) {
320        if(**pValue==0) {
321            fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
322            exit(U_INVALID_TABLE_FORMAT);
323        }
324        uprv_strcpy(ucm->baseName, *pValue);
325        return TRUE;
326    }
327
328    return FALSE;
329}
330
331/* post-processing ---------------------------------------------------------- */
332
333static int32_t
334sumUpStates(UCMStates *states) {
335    int32_t entry, sum, state, cell, count;
336    UBool allStatesReady;
337
338    /*
339     * Sum up the offsets for all states.
340     * In each final state (where there are only final entries),
341     * the offsets add up directly.
342     * In all other state table rows, for each transition entry to another state,
343     * the offsets sum of that state needs to be added.
344     * This is achieved in at most countStates iterations.
345     */
346    allStatesReady=FALSE;
347    for(count=states->countStates; !allStatesReady && count>=0; --count) {
348        allStatesReady=TRUE;
349        for(state=states->countStates-1; state>=0; --state) {
350            if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
351                allStatesReady=FALSE;
352                sum=0;
353
354                /* at first, add up only the final delta offsets to keep them <512 */
355                for(cell=0; cell<256; ++cell) {
356                    entry=states->stateTable[state][cell];
357                    if(MBCS_ENTRY_IS_FINAL(entry)) {
358                        switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
359                        case MBCS_STATE_VALID_16:
360                            states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
361                            sum+=1;
362                            break;
363                        case MBCS_STATE_VALID_16_PAIR:
364                            states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
365                            sum+=2;
366                            break;
367                        default:
368                            /* no addition */
369                            break;
370                        }
371                    }
372                }
373
374                /* now, add up the delta offsets for the transitional entries */
375                for(cell=0; cell<256; ++cell) {
376                    entry=states->stateTable[state][cell];
377                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
378                        if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
379                            states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
380                            sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
381                        } else {
382                            /* that next state does not have a sum yet, we cannot finish the one for this state */
383                            sum=-1;
384                            break;
385                        }
386                    }
387                }
388
389                if(sum!=-1) {
390                    states->stateOffsetSum[state]=sum;
391                    states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
392                }
393            }
394        }
395    }
396
397    if(!allStatesReady) {
398        fprintf(stderr, "ucm error: the state table contains loops\n");
399        exit(U_INVALID_TABLE_FORMAT);
400    }
401
402    /*
403     * For all "direct" (i.e., initial) states>0,
404     * the offsets need to be increased by the sum of
405     * the previous initial states.
406     */
407    sum=states->stateOffsetSum[0];
408    for(state=1; state<states->countStates; ++state) {
409        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
410            int32_t sum2=sum;
411            sum+=states->stateOffsetSum[state];
412            for(cell=0; cell<256; ++cell) {
413                entry=states->stateTable[state][cell];
414                if(MBCS_ENTRY_IS_TRANSITION(entry)) {
415                    states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
416                }
417            }
418        }
419    }
420
421    /* round up to the next even number to have the following data 32-bit-aligned */
422    return states->countToUCodeUnits=(sum+1)&~1;
423}
424
425U_CAPI void U_EXPORT2
426ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) {
427    int32_t entry, state, cell, count;
428
429    if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
430        fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
431        exit(U_INVALID_TABLE_FORMAT);
432    }
433
434    if(states->countStates==0) {
435        switch(states->conversionType) {
436        case UCNV_SBCS:
437            /* SBCS: use MBCS data structure with a default state table */
438            if(states->maxCharLength!=1) {
439                fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
440                exit(U_INVALID_TABLE_FORMAT);
441            }
442            states->conversionType=UCNV_MBCS;
443            ucm_addState(states, "0-ff");
444            break;
445        case UCNV_MBCS:
446            fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n");
447            exit(U_INVALID_TABLE_FORMAT);
448            break;
449        case UCNV_EBCDIC_STATEFUL:
450            /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
451            if(states->minCharLength!=1 || states->maxCharLength!=2) {
452                fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
453                exit(U_INVALID_TABLE_FORMAT);
454            }
455            states->conversionType=UCNV_MBCS;
456            ucm_addState(states, "0-ff, e:1.s, f:0.s");
457            ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
458            ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
459            ucm_addState(states, "0-ff:1.i, 40:1.");
460            ucm_addState(states, "0-ff:1.i");
461            break;
462        case UCNV_DBCS:
463            /* DBCS: use MBCS data structure with a default state table */
464            if(states->minCharLength!=2 || states->maxCharLength!=2) {
465                fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
466                exit(U_INVALID_TABLE_FORMAT);
467            }
468            states->conversionType = UCNV_MBCS;
469            ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
470            ucm_addState(states, "41-fe");
471            ucm_addState(states, "40");
472            ucm_addState(states, "");
473            break;
474        default:
475            fprintf(stderr, "ucm error: unknown charset structure\n");
476            exit(U_INVALID_TABLE_FORMAT);
477            break;
478        }
479    }
480
481    /*
482     * check that the min/max character lengths are reasonable;
483     * to do this right, all paths through the state table would have to be
484     * recursively walked while keeping track of the sequence lengths,
485     * but these simple checks cover most state tables in practice
486     */
487    if(states->maxCharLength<states->minCharLength) {
488        fprintf(stderr, "ucm error: max B/char < min B/char\n");
489        exit(U_INVALID_TABLE_FORMAT);
490    }
491
492    /* count non-direct states and compare with max B/char */
493    count=0;
494    for(state=0; state<states->countStates; ++state) {
495        if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
496            ++count;
497        }
498    }
499    if(states->maxCharLength>count+1) {
500        fprintf(stderr, "ucm error: max B/char too large\n");
501        exit(U_INVALID_TABLE_FORMAT);
502    }
503
504    if(states->minCharLength==1) {
505        int32_t action;
506
507        /*
508         * if there are single-byte characters,
509         * then the initial state must have direct result states
510         */
511        for(cell=0; cell<256; ++cell) {
512            entry=states->stateTable[0][cell];
513            if( MBCS_ENTRY_IS_FINAL(entry) &&
514                ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
515                 action==MBCS_STATE_UNASSIGNED)
516            ) {
517                break;
518            }
519        }
520
521        if(cell==256) {
522            fprintf(stderr, "ucm warning: min B/char too small\n");
523        }
524    }
525
526    /*
527     * make sure that all "next state" values are within limits
528     * and that all next states after final ones have the "direct"
529     * flag of initial states
530     */
531    for(state=states->countStates-1; state>=0; --state) {
532        for(cell=0; cell<256; ++cell) {
533            entry=states->stateTable[state][cell];
534            if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
535                fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
536                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
537                exit(U_INVALID_TABLE_FORMAT);
538            }
539            if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
540                fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
541                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
542                exit(U_INVALID_TABLE_FORMAT);
543            } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
544                fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
545                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
546                exit(U_INVALID_TABLE_FORMAT);
547            }
548        }
549    }
550
551    /* is this an SI/SO (like EBCDIC-stateful) state table? */
552    if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
553        if(states->maxCharLength!=2) {
554            fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength);
555            exit(U_INVALID_TABLE_FORMAT);
556        }
557        if(states->countStates<3) {
558            fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates);
559            exit(U_INVALID_TABLE_FORMAT);
560        }
561        /* are the SI/SO all in the right places? */
562        if( ignoreSISOCheck ||
563           (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
564            states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
565            states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
566            states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0))
567        ) {
568            states->outputType=MBCS_OUTPUT_2_SISO;
569        } else {
570            fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
571            exit(U_INVALID_TABLE_FORMAT);
572        }
573        state=2;
574    } else {
575        state=1;
576    }
577
578    /* check that no unexpected state is a "direct" one */
579    while(state<states->countStates) {
580        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
581            fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state);
582            exit(U_INVALID_TABLE_FORMAT);
583        }
584        ++state;
585    }
586
587    sumUpStates(states);
588}
589
590/* find a fallback for this offset; return the index or -1 if not found */
591U_CAPI int32_t U_EXPORT2
592ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
593                 uint32_t offset) {
594    int32_t i;
595
596    if(countToUFallbacks==0) {
597        /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
598        return -1;
599    }
600
601    /* do a linear search for the fallback mapping (the table is not yet sorted) */
602    for(i=0; i<countToUFallbacks; ++i) {
603        if(offset==toUFallbacks[i].offset) {
604            return i;
605        }
606    }
607    return -1;
608}
609
610/*
611 * This function tries to compact toUnicode tables for 2-byte codepages
612 * by finding lead bytes with all-unassigned trail bytes and adding another state
613 * for them.
614 */
615static void
616compactToUnicode2(UCMStates *states,
617                  uint16_t **pUnicodeCodeUnits,
618                  _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
619                  UBool verbose) {
620    int32_t (*oldStateTable)[256];
621    uint16_t count[256];
622    uint16_t *oldUnicodeCodeUnits;
623    int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
624    int32_t i, j, leadState, trailState, newState, fallback;
625    uint16_t unit;
626
627    /* find the lead state */
628    if(states->outputType==MBCS_OUTPUT_2_SISO) {
629        /* use the DBCS lead state for SI/SO codepages */
630        leadState=1;
631    } else {
632        leadState=0;
633    }
634
635    /* find the main trail state: the most used target state */
636    uprv_memset(count, 0, sizeof(count));
637    for(i=0; i<256; ++i) {
638        entry=states->stateTable[leadState][i];
639        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
640            ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
641        }
642    }
643    trailState=0;
644    for(i=1; i<states->countStates; ++i) {
645        if(count[i]>count[trailState]) {
646            trailState=i;
647        }
648    }
649
650    /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
651    uprv_memset(count, 0, sizeof(count));
652    savings=0;
653    /* for each lead byte */
654    for(i=0; i<256; ++i) {
655        entry=states->stateTable[leadState][i];
656        if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
657            /* the offset is different for each lead byte */
658            offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
659            /* for each trail byte for this lead byte */
660            for(j=0; j<256; ++j) {
661                entry=states->stateTable[trailState][j];
662                switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
663                case MBCS_STATE_VALID_16:
664                    entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
665                    if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
666                        ++count[i];
667                    } else {
668                        j=999; /* do not count for this lead byte because there are assignments */
669                    }
670                    break;
671                case MBCS_STATE_VALID_16_PAIR:
672                    entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
673                    if((*pUnicodeCodeUnits)[entry]==0xfffe) {
674                        count[i]+=2;
675                    } else {
676                        j=999; /* do not count for this lead byte because there are assignments */
677                    }
678                    break;
679                default:
680                    break;
681                }
682            }
683            if(j==256) {
684                /* all trail bytes for this lead byte are unassigned */
685                savings+=count[i];
686            } else {
687                count[i]=0;
688            }
689        }
690    }
691    /* subtract from the possible savings the cost of an additional state */
692    savings=savings*2-1024; /* count bytes, not 16-bit words */
693    if(savings<=0) {
694        return;
695    }
696    if(verbose) {
697        printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
698    }
699    if(states->countStates>=MBCS_MAX_STATE_COUNT) {
700        fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
701        return;
702    }
703
704    /* make a copy of the state table */
705    oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024);
706    if(oldStateTable==NULL) {
707        fprintf(stderr, "cannot compact toUnicode: out of memory\n");
708        return;
709    }
710    uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
711
712    /* add the new state */
713    /*
714     * this function does not catch the degenerate case where all lead bytes
715     * have all-unassigned trail bytes and the lead state could be removed
716     */
717    newState=states->countStates++;
718    states->stateFlags[newState]=0;
719    /* copy the old trail state, turning all assigned states into unassigned ones */
720    for(i=0; i<256; ++i) {
721        entry=states->stateTable[trailState][i];
722        switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
723        case MBCS_STATE_VALID_16:
724        case MBCS_STATE_VALID_16_PAIR:
725            states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
726            break;
727        default:
728            states->stateTable[newState][i]=entry;
729            break;
730        }
731    }
732
733    /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
734    for(i=0; i<256; ++i) {
735        if(count[i]>0) {
736            states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
737        }
738    }
739
740    /* sum up the new state table */
741    for(i=0; i<states->countStates; ++i) {
742        states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
743    }
744    sum=sumUpStates(states);
745
746    /* allocate a new, smaller code units array */
747    oldUnicodeCodeUnits=*pUnicodeCodeUnits;
748    if(sum==0) {
749        *pUnicodeCodeUnits=NULL;
750        if(oldUnicodeCodeUnits!=NULL) {
751            uprv_free(oldUnicodeCodeUnits);
752        }
753        uprv_free(oldStateTable);
754        return;
755    }
756    *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
757    if(*pUnicodeCodeUnits==NULL) {
758        fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
759            (long)sum);
760        /* revert to the old state table */
761        *pUnicodeCodeUnits=oldUnicodeCodeUnits;
762        --states->countStates;
763        uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
764        uprv_free(oldStateTable);
765        return;
766    }
767    for(i=0; i<sum; ++i) {
768        (*pUnicodeCodeUnits)[i]=0xfffe;
769    }
770
771    /* copy the code units for all assigned characters */
772    /*
773     * The old state table has the same lead _and_ trail states for assigned characters!
774     * The differences are in the offsets, and in the trail states for some unassigned characters.
775     * For each character with an assigned state in the new table, it was assigned in the old one.
776     * Only still-assigned characters are copied.
777     * Note that fallback mappings need to get their offset values adjusted.
778     */
779
780    /* for each initial state */
781    for(leadState=0; leadState<states->countStates; ++leadState) {
782        if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
783            /* for each lead byte from there */
784            for(i=0; i<256; ++i) {
785                entry=states->stateTable[leadState][i];
786                if(MBCS_ENTRY_IS_TRANSITION(entry)) {
787                    trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
788                    /* the new state does not have assigned states */
789                    if(trailState!=newState) {
790                        trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
791                        oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
792                        /* for each trail byte */
793                        for(j=0; j<256; ++j) {
794                            entry=states->stateTable[trailState][j];
795                            /* copy assigned-character code units and adjust fallback offsets */
796                            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
797                            case MBCS_STATE_VALID_16:
798                                offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
799                                /* find the old offset according to the old state table */
800                                oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
801                                unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
802                                if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
803                                    toUFallbacks[fallback].offset=0x80000000|offset;
804                                }
805                                break;
806                            case MBCS_STATE_VALID_16_PAIR:
807                                offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
808                                /* find the old offset according to the old state table */
809                                oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
810                                (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
811                                (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
812                                break;
813                            default:
814                                break;
815                            }
816                        }
817                    }
818                }
819            }
820        }
821    }
822
823    /* remove temporary flags from fallback offsets that protected them from being modified twice */
824    for(i=0; i<countToUFallbacks; ++i) {
825        toUFallbacks[i].offset&=0x7fffffff;
826    }
827
828    /* free temporary memory */
829    uprv_free(oldUnicodeCodeUnits);
830    uprv_free(oldStateTable);
831}
832
833/*
834 * recursive sub-function of compactToUnicodeHelper()
835 * returns:
836 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
837 *    if all sequences from this state are unassigned, returns the
838 * <0 there are assignments in unicodeCodeUnits[]
839 * 0  no use of unicodeCodeUnits[]
840 */
841static int32_t
842findUnassigned(UCMStates *states,
843               uint16_t *unicodeCodeUnits,
844               _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
845               int32_t state, int32_t offset, uint32_t b) {
846    int32_t i, entry, savings, localSavings, belowSavings;
847    UBool haveAssigned;
848
849    localSavings=belowSavings=0;
850    haveAssigned=FALSE;
851    for(i=0; i<256; ++i) {
852        entry=states->stateTable[state][i];
853        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
854            savings=findUnassigned(states,
855                        unicodeCodeUnits,
856                        toUFallbacks, countToUFallbacks,
857                        MBCS_ENTRY_TRANSITION_STATE(entry),
858                        offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
859                        (b<<8)|(uint32_t)i);
860            if(savings<0) {
861                haveAssigned=TRUE;
862            } else if(savings>0) {
863                printf("    all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
864                    (unsigned long)((b<<8)|i), (long)state, (long)savings);
865                belowSavings+=savings;
866            }
867        } else if(!haveAssigned) {
868            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
869            case MBCS_STATE_VALID_16:
870                entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
871                if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
872                    localSavings+=2;
873                } else {
874                    haveAssigned=TRUE;
875                }
876                break;
877            case MBCS_STATE_VALID_16_PAIR:
878                entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
879                if(unicodeCodeUnits[entry]==0xfffe) {
880                    localSavings+=4;
881                } else {
882                    haveAssigned=TRUE;
883                }
884                break;
885            default:
886                break;
887            }
888        }
889    }
890    if(haveAssigned) {
891        return -1;
892    } else {
893        return localSavings+belowSavings;
894    }
895}
896
897/* helper function for finding compaction opportunities */
898static void
899compactToUnicodeHelper(UCMStates *states,
900                       uint16_t *unicodeCodeUnits,
901                       _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
902    int32_t state, savings;
903
904    /* for each initial state */
905    for(state=0; state<states->countStates; ++state) {
906        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
907            savings=findUnassigned(states,
908                        unicodeCodeUnits,
909                        toUFallbacks, countToUFallbacks,
910                        state, 0, 0);
911            if(savings>0) {
912                printf("    all-unassigned sequences from initial state %ld use %ld bytes\n",
913                    (long)state, (long)savings);
914            }
915        }
916    }
917}
918
919static int32_t
920compareFallbacks(const void *context, const void *fb1, const void *fb2) {
921    return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
922}
923
924U_CAPI void U_EXPORT2
925ucm_optimizeStates(UCMStates *states,
926                   uint16_t **pUnicodeCodeUnits,
927                   _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
928                   UBool verbose) {
929    UErrorCode errorCode;
930    int32_t state, cell, entry;
931
932    /* test each state table entry */
933    for(state=0; state<states->countStates; ++state) {
934        for(cell=0; cell<256; ++cell) {
935            entry=states->stateTable[state][cell];
936            /*
937             * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
938             * and the code point is "unassigned" (0xfffe), then change it to
939             * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
940             */
941            if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
942                states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
943            }
944        }
945    }
946
947    /* try to compact the toUnicode tables */
948    if(states->maxCharLength==2) {
949        compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
950    } else if(states->maxCharLength>2) {
951        if(verbose) {
952            compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
953        }
954    }
955
956    /* sort toUFallbacks */
957    /*
958     * It should be safe to sort them before compactToUnicode2() is called,
959     * because it should not change the relative order of the offset values
960     * that it adjusts, but they need to be sorted at some point, and
961     * it is safest here.
962     */
963    if(countToUFallbacks>0) {
964        errorCode=U_ZERO_ERROR; /* nothing bad will happen... */
965        uprv_sortArray(toUFallbacks, countToUFallbacks,
966                       sizeof(_MBCSToUFallback),
967                       compareFallbacks, NULL, FALSE, &errorCode);
968    }
969}
970
971/* use a complete state table ----------------------------------------------- */
972
973U_CAPI int32_t U_EXPORT2
974ucm_countChars(UCMStates *states,
975               const uint8_t *bytes, int32_t length) {
976    uint32_t offset;
977    int32_t i, entry, count;
978    uint8_t state;
979
980    offset=0;
981    count=0;
982    state=0;
983
984    if(states->countStates==0) {
985        fprintf(stderr, "ucm error: there is no state information!\n");
986        return -1;
987    }
988
989    /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
990    if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
991        state=1;
992    }
993
994    /*
995     * Walk down the state table like in conversion,
996     * much like getNextUChar().
997     * We assume that c<=0x10ffff.
998     */
999    for(i=0; i<length; ++i) {
1000        entry=states->stateTable[state][bytes[i]];
1001        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1002            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1003            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1004        } else {
1005            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
1006            case MBCS_STATE_ILLEGAL:
1007                fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
1008                return -1;
1009            case MBCS_STATE_CHANGE_ONLY:
1010                fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
1011                return -1;
1012            case MBCS_STATE_UNASSIGNED:
1013            case MBCS_STATE_FALLBACK_DIRECT_16:
1014            case MBCS_STATE_VALID_DIRECT_16:
1015            case MBCS_STATE_FALLBACK_DIRECT_20:
1016            case MBCS_STATE_VALID_DIRECT_20:
1017            case MBCS_STATE_VALID_16:
1018            case MBCS_STATE_VALID_16_PAIR:
1019                /* count a complete character and prepare for a new one */
1020                ++count;
1021                state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1022                offset=0;
1023                break;
1024            default:
1025                /* reserved, must never occur */
1026                fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry);
1027                return -1;
1028            }
1029        }
1030    }
1031
1032    if(offset!=0) {
1033        fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state);
1034        return -1;
1035    }
1036
1037    /*
1038     * for SI/SO (like EBCDIC-stateful), multiple-character results
1039     * must consist of only double-byte sequences
1040     */
1041    if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
1042        fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count);
1043        return -1;
1044    }
1045
1046    return count;
1047}
1048#endif
1049
1050