1/*
2*******************************************************************************
3*
4*   Copyright (C) 2003-2012, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucmstate.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2003oct09
14*   created by: Markus W. Scherer
15*
16*   This file handles ICU .ucm file state information as part of the ucm module.
17*   Most of this code used to be in makeconv.c.
18*/
19
20#include "unicode/utypes.h"
21#include "cstring.h"
22#include "cmemory.h"
23#include "uarrsort.h"
24#include "ucnvmbcs.h"
25#include "ucnv_ext.h"
26#include "uparse.h"
27#include "ucm.h"
28#include <stdio.h>
29
30#if !UCONFIG_NO_CONVERSION
31
32/* MBCS state handling ------------------------------------------------------ */
33
34/*
35 * state table row grammar (ebnf-style):
36 * (whitespace is allowed between all tokens)
37 *
38 * row=[[firstentry ','] entry (',' entry)*]
39 * firstentry="initial" | "surrogates"
40 *            (initial state (default for state 0), output is all surrogate pairs)
41 * entry=range [':' nextstate] ['.' action]
42 * range=number ['-' number]
43 * nextstate=number
44 *           (0..7f)
45 * action='u' | 's' | 'p' | 'i'
46 *        (unassigned, state change only, surrogate pair, illegal)
47 * number=(1- or 2-digit hexadecimal number)
48 */
49static const char *
50parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
51    const char *t;
52    uint32_t start, end, i;
53    int32_t entry;
54
55    /* initialize the state: all illegal with U+ffff */
56    for(i=0; i<256; ++i) {
57        state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
58    }
59
60    /* skip leading white space */
61    s=u_skipWhitespace(s);
62
63    /* is there an "initial" or "surrogates" directive? */
64    if(uprv_strncmp("initial", s, 7)==0) {
65        *pFlags=MBCS_STATE_FLAG_DIRECT;
66        s=u_skipWhitespace(s+7);
67        if(*s++!=',') {
68            return s-1;
69        }
70    } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
71        *pFlags=MBCS_STATE_FLAG_SURROGATES;
72        s=u_skipWhitespace(s+10);
73        if(*s++!=',') {
74            return s-1;
75        }
76    } else if(*s==0) {
77        /* empty state row: all-illegal */
78        return NULL;
79    }
80
81    for(;;) {
82        /* read an entry, the start of the range first */
83        s=u_skipWhitespace(s);
84        start=uprv_strtoul(s, (char **)&t, 16);
85        if(s==t || 0xff<start) {
86            return s;
87        }
88        s=u_skipWhitespace(t);
89
90        /* read the end of the range if there is one */
91        if(*s=='-') {
92            s=u_skipWhitespace(s+1);
93            end=uprv_strtoul(s, (char **)&t, 16);
94            if(s==t || end<start || 0xff<end) {
95                return s;
96            }
97            s=u_skipWhitespace(t);
98        } else {
99            end=start;
100        }
101
102        /* determine the state entrys for this range */
103        if(*s!=':' && *s!='.') {
104            /* the default is: final state with valid entries */
105            entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
106        } else {
107            entry=MBCS_ENTRY_TRANSITION(0, 0);
108            if(*s==':') {
109                /* get the next state, default to 0 */
110                s=u_skipWhitespace(s+1);
111                i=uprv_strtoul(s, (char **)&t, 16);
112                if(s!=t) {
113                    if(0x7f<i) {
114                        return s;
115                    }
116                    s=u_skipWhitespace(t);
117                    entry=MBCS_ENTRY_SET_STATE(entry, i);
118                }
119            }
120
121            /* get the state action, default to valid */
122            if(*s=='.') {
123                /* this is a final state */
124                entry=MBCS_ENTRY_SET_FINAL(entry);
125
126                s=u_skipWhitespace(s+1);
127                if(*s=='u') {
128                    /* unassigned set U+fffe */
129                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
130                    s=u_skipWhitespace(s+1);
131                } else if(*s=='p') {
132                    if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
133                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
134                    } else {
135                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
136                    }
137                    s=u_skipWhitespace(s+1);
138                } else if(*s=='s') {
139                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
140                    s=u_skipWhitespace(s+1);
141                } else if(*s=='i') {
142                    /* illegal set U+ffff */
143                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
144                    s=u_skipWhitespace(s+1);
145                } else {
146                    /* default to valid */
147                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
148                }
149            } else {
150                /* this is an intermediate state, nothing to do */
151            }
152        }
153
154        /* adjust "final valid" states according to the state flags */
155        if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
156            switch(*pFlags) {
157            case 0:
158                /* no adjustment */
159                break;
160            case MBCS_STATE_FLAG_DIRECT:
161                /* set the valid-direct code point to "unassigned"==0xfffe */
162                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
163                break;
164            case MBCS_STATE_FLAG_SURROGATES:
165                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
166                break;
167            default:
168                break;
169            }
170        }
171
172        /* set this entry for the range */
173        for(i=start; i<=end; ++i) {
174            state[i]=entry;
175        }
176
177        if(*s==',') {
178            ++s;
179        } else {
180            return *s==0 ? NULL : s;
181        }
182    }
183}
184
185U_CAPI void U_EXPORT2
186ucm_addState(UCMStates *states, const char *s) {
187    const char *error;
188
189    if(states->countStates==MBCS_MAX_STATE_COUNT) {
190        fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
191        exit(U_INVALID_TABLE_FORMAT);
192    }
193
194    error=parseState(s, states->stateTable[states->countStates],
195                       &states->stateFlags[states->countStates]);
196    if(error!=NULL) {
197        fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error);
198        exit(U_INVALID_TABLE_FORMAT);
199    }
200
201    ++states->countStates;
202}
203
204U_CAPI UBool U_EXPORT2
205ucm_parseHeaderLine(UCMFile *ucm,
206                    char *line, char **pKey, char **pValue) {
207    UCMStates *states;
208    char *s, *end;
209    char c;
210
211    states=&ucm->states;
212
213    /* remove comments and trailing CR and LF and remove whitespace from the end */
214    for(end=line; (c=*end)!=0; ++end) {
215        if(c=='#' || c=='\r' || c=='\n') {
216            break;
217        }
218    }
219    while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
220        --end;
221    }
222    *end=0;
223
224    /* skip leading white space and ignore empty lines */
225    s=(char *)u_skipWhitespace(line);
226    if(*s==0) {
227        return TRUE;
228    }
229
230    /* stop at the beginning of the mapping section */
231    if(uprv_memcmp(s, "CHARMAP", 7)==0) {
232        return FALSE;
233    }
234
235    /* get the key name, bracketed in <> */
236    if(*s!='<') {
237        fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
238        exit(U_INVALID_TABLE_FORMAT);
239    }
240    *pKey=++s;
241    while(*s!='>') {
242        if(*s==0) {
243            fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
244            exit(U_INVALID_TABLE_FORMAT);
245        }
246        ++s;
247    }
248    *s=0;
249
250    /* get the value string, possibly quoted */
251    s=(char *)u_skipWhitespace(s+1);
252    if(*s!='"') {
253        *pValue=s;
254    } else {
255        /* remove the quotes */
256        *pValue=s+1;
257        if(end>*pValue && *(end-1)=='"') {
258            *--end=0;
259        }
260    }
261
262    /* collect the information from the header field, ignore unknown keys */
263    if(uprv_strcmp(*pKey, "uconv_class")==0) {
264        if(uprv_strcmp(*pValue, "DBCS")==0) {
265            states->conversionType=UCNV_DBCS;
266        } else if(uprv_strcmp(*pValue, "SBCS")==0) {
267            states->conversionType = UCNV_SBCS;
268        } else if(uprv_strcmp(*pValue, "MBCS")==0) {
269            states->conversionType = UCNV_MBCS;
270        } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
271            states->conversionType = UCNV_EBCDIC_STATEFUL;
272        } else {
273            fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
274            exit(U_INVALID_TABLE_FORMAT);
275        }
276        return TRUE;
277    } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
278        c=**pValue;
279        if('1'<=c && c<='4' && (*pValue)[1]==0) {
280            states->maxCharLength=(int8_t)(c-'0');
281            states->outputType=(int8_t)(states->maxCharLength-1);
282        } else {
283            fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
284            exit(U_INVALID_TABLE_FORMAT);
285        }
286        return TRUE;
287    } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
288        c=**pValue;
289        if('1'<=c && c<='4' && (*pValue)[1]==0) {
290            states->minCharLength=(int8_t)(c-'0');
291        } else {
292            fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
293            exit(U_INVALID_TABLE_FORMAT);
294        }
295        return TRUE;
296    } else if(uprv_strcmp(*pKey, "icu:state")==0) {
297        /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
298        switch(states->conversionType) {
299        case UCNV_SBCS:
300        case UCNV_DBCS:
301        case UCNV_EBCDIC_STATEFUL:
302            states->conversionType=UCNV_MBCS;
303            break;
304        case UCNV_MBCS:
305            break;
306        default:
307            fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
308            exit(U_INVALID_TABLE_FORMAT);
309        }
310
311        if(states->maxCharLength==0) {
312            fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
313            exit(U_INVALID_TABLE_FORMAT);
314        }
315        ucm_addState(states, *pValue);
316        return TRUE;
317    } else if(uprv_strcmp(*pKey, "icu:base")==0) {
318        if(**pValue==0) {
319            fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
320            exit(U_INVALID_TABLE_FORMAT);
321        }
322        uprv_strcpy(ucm->baseName, *pValue);
323        return TRUE;
324    }
325
326    return FALSE;
327}
328
329/* post-processing ---------------------------------------------------------- */
330
331static int32_t
332sumUpStates(UCMStates *states) {
333    int32_t entry, sum, state, cell, count;
334    UBool allStatesReady;
335
336    /*
337     * Sum up the offsets for all states.
338     * In each final state (where there are only final entries),
339     * the offsets add up directly.
340     * In all other state table rows, for each transition entry to another state,
341     * the offsets sum of that state needs to be added.
342     * This is achieved in at most countStates iterations.
343     */
344    allStatesReady=FALSE;
345    for(count=states->countStates; !allStatesReady && count>=0; --count) {
346        allStatesReady=TRUE;
347        for(state=states->countStates-1; state>=0; --state) {
348            if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
349                allStatesReady=FALSE;
350                sum=0;
351
352                /* at first, add up only the final delta offsets to keep them <512 */
353                for(cell=0; cell<256; ++cell) {
354                    entry=states->stateTable[state][cell];
355                    if(MBCS_ENTRY_IS_FINAL(entry)) {
356                        switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
357                        case MBCS_STATE_VALID_16:
358                            states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
359                            sum+=1;
360                            break;
361                        case MBCS_STATE_VALID_16_PAIR:
362                            states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
363                            sum+=2;
364                            break;
365                        default:
366                            /* no addition */
367                            break;
368                        }
369                    }
370                }
371
372                /* now, add up the delta offsets for the transitional entries */
373                for(cell=0; cell<256; ++cell) {
374                    entry=states->stateTable[state][cell];
375                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
376                        if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
377                            states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
378                            sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
379                        } else {
380                            /* that next state does not have a sum yet, we cannot finish the one for this state */
381                            sum=-1;
382                            break;
383                        }
384                    }
385                }
386
387                if(sum!=-1) {
388                    states->stateOffsetSum[state]=sum;
389                    states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
390                }
391            }
392        }
393    }
394
395    if(!allStatesReady) {
396        fprintf(stderr, "ucm error: the state table contains loops\n");
397        exit(U_INVALID_TABLE_FORMAT);
398    }
399
400    /*
401     * For all "direct" (i.e., initial) states>0,
402     * the offsets need to be increased by the sum of
403     * the previous initial states.
404     */
405    sum=states->stateOffsetSum[0];
406    for(state=1; state<states->countStates; ++state) {
407        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
408            int32_t sum2=sum;
409            sum+=states->stateOffsetSum[state];
410            for(cell=0; cell<256; ++cell) {
411                entry=states->stateTable[state][cell];
412                if(MBCS_ENTRY_IS_TRANSITION(entry)) {
413                    states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
414                }
415            }
416        }
417    }
418
419    /* round up to the next even number to have the following data 32-bit-aligned */
420    return states->countToUCodeUnits=(sum+1)&~1;
421}
422
423U_CAPI void U_EXPORT2
424ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) {
425    int32_t entry, state, cell, count;
426
427    if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
428        fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
429        exit(U_INVALID_TABLE_FORMAT);
430    }
431
432    if(states->countStates==0) {
433        switch(states->conversionType) {
434        case UCNV_SBCS:
435            /* SBCS: use MBCS data structure with a default state table */
436            if(states->maxCharLength!=1) {
437                fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
438                exit(U_INVALID_TABLE_FORMAT);
439            }
440            states->conversionType=UCNV_MBCS;
441            ucm_addState(states, "0-ff");
442            break;
443        case UCNV_MBCS:
444            fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n");
445            exit(U_INVALID_TABLE_FORMAT);
446            break;
447        case UCNV_EBCDIC_STATEFUL:
448            /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
449            if(states->minCharLength!=1 || states->maxCharLength!=2) {
450                fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
451                exit(U_INVALID_TABLE_FORMAT);
452            }
453            states->conversionType=UCNV_MBCS;
454            ucm_addState(states, "0-ff, e:1.s, f:0.s");
455            ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
456            ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
457            ucm_addState(states, "0-ff:1.i, 40:1.");
458            ucm_addState(states, "0-ff:1.i");
459            break;
460        case UCNV_DBCS:
461            /* DBCS: use MBCS data structure with a default state table */
462            if(states->minCharLength!=2 || states->maxCharLength!=2) {
463                fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
464                exit(U_INVALID_TABLE_FORMAT);
465            }
466            states->conversionType = UCNV_MBCS;
467            ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
468            ucm_addState(states, "41-fe");
469            ucm_addState(states, "40");
470            ucm_addState(states, "");
471            break;
472        default:
473            fprintf(stderr, "ucm error: unknown charset structure\n");
474            exit(U_INVALID_TABLE_FORMAT);
475            break;
476        }
477    }
478
479    /*
480     * check that the min/max character lengths are reasonable;
481     * to do this right, all paths through the state table would have to be
482     * recursively walked while keeping track of the sequence lengths,
483     * but these simple checks cover most state tables in practice
484     */
485    if(states->maxCharLength<states->minCharLength) {
486        fprintf(stderr, "ucm error: max B/char < min B/char\n");
487        exit(U_INVALID_TABLE_FORMAT);
488    }
489
490    /* count non-direct states and compare with max B/char */
491    count=0;
492    for(state=0; state<states->countStates; ++state) {
493        if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
494            ++count;
495        }
496    }
497    if(states->maxCharLength>count+1) {
498        fprintf(stderr, "ucm error: max B/char too large\n");
499        exit(U_INVALID_TABLE_FORMAT);
500    }
501
502    if(states->minCharLength==1) {
503        int32_t action;
504
505        /*
506         * if there are single-byte characters,
507         * then the initial state must have direct result states
508         */
509        for(cell=0; cell<256; ++cell) {
510            entry=states->stateTable[0][cell];
511            if( MBCS_ENTRY_IS_FINAL(entry) &&
512                ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
513                 action==MBCS_STATE_UNASSIGNED)
514            ) {
515                break;
516            }
517        }
518
519        if(cell==256) {
520            fprintf(stderr, "ucm warning: min B/char too small\n");
521        }
522    }
523
524    /*
525     * make sure that all "next state" values are within limits
526     * and that all next states after final ones have the "direct"
527     * flag of initial states
528     */
529    for(state=states->countStates-1; state>=0; --state) {
530        for(cell=0; cell<256; ++cell) {
531            entry=states->stateTable[state][cell];
532            if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
533                fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
534                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
535                exit(U_INVALID_TABLE_FORMAT);
536            }
537            if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
538                fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
539                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
540                exit(U_INVALID_TABLE_FORMAT);
541            } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
542                fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
543                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
544                exit(U_INVALID_TABLE_FORMAT);
545            }
546        }
547    }
548
549    /* is this an SI/SO (like EBCDIC-stateful) state table? */
550    if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
551        if(states->maxCharLength!=2) {
552            fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength);
553            exit(U_INVALID_TABLE_FORMAT);
554        }
555        if(states->countStates<3) {
556            fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates);
557            exit(U_INVALID_TABLE_FORMAT);
558        }
559        /* are the SI/SO all in the right places? */
560        if( ignoreSISOCheck ||
561           (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
562            states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
563            states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
564            states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0))
565        ) {
566            states->outputType=MBCS_OUTPUT_2_SISO;
567        } else {
568            fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
569            exit(U_INVALID_TABLE_FORMAT);
570        }
571        state=2;
572    } else {
573        state=1;
574    }
575
576    /* check that no unexpected state is a "direct" one */
577    while(state<states->countStates) {
578        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
579            fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state);
580            exit(U_INVALID_TABLE_FORMAT);
581        }
582        ++state;
583    }
584
585    sumUpStates(states);
586}
587
588/* find a fallback for this offset; return the index or -1 if not found */
589U_CAPI int32_t U_EXPORT2
590ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
591                 uint32_t offset) {
592    int32_t i;
593
594    if(countToUFallbacks==0) {
595        /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
596        return -1;
597    }
598
599    /* do a linear search for the fallback mapping (the table is not yet sorted) */
600    for(i=0; i<countToUFallbacks; ++i) {
601        if(offset==toUFallbacks[i].offset) {
602            return i;
603        }
604    }
605    return -1;
606}
607
608/*
609 * This function tries to compact toUnicode tables for 2-byte codepages
610 * by finding lead bytes with all-unassigned trail bytes and adding another state
611 * for them.
612 */
613static void
614compactToUnicode2(UCMStates *states,
615                  uint16_t **pUnicodeCodeUnits,
616                  _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
617                  UBool verbose) {
618    int32_t (*oldStateTable)[256];
619    uint16_t count[256];
620    uint16_t *oldUnicodeCodeUnits;
621    int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
622    int32_t i, j, leadState, trailState, newState, fallback;
623    uint16_t unit;
624
625    /* find the lead state */
626    if(states->outputType==MBCS_OUTPUT_2_SISO) {
627        /* use the DBCS lead state for SI/SO codepages */
628        leadState=1;
629    } else {
630        leadState=0;
631    }
632
633    /* find the main trail state: the most used target state */
634    uprv_memset(count, 0, sizeof(count));
635    for(i=0; i<256; ++i) {
636        entry=states->stateTable[leadState][i];
637        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
638            ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
639        }
640    }
641    trailState=0;
642    for(i=1; i<states->countStates; ++i) {
643        if(count[i]>count[trailState]) {
644            trailState=i;
645        }
646    }
647
648    /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
649    uprv_memset(count, 0, sizeof(count));
650    savings=0;
651    /* for each lead byte */
652    for(i=0; i<256; ++i) {
653        entry=states->stateTable[leadState][i];
654        if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
655            /* the offset is different for each lead byte */
656            offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
657            /* for each trail byte for this lead byte */
658            for(j=0; j<256; ++j) {
659                entry=states->stateTable[trailState][j];
660                switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
661                case MBCS_STATE_VALID_16:
662                    entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
663                    if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
664                        ++count[i];
665                    } else {
666                        j=999; /* do not count for this lead byte because there are assignments */
667                    }
668                    break;
669                case MBCS_STATE_VALID_16_PAIR:
670                    entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
671                    if((*pUnicodeCodeUnits)[entry]==0xfffe) {
672                        count[i]+=2;
673                    } else {
674                        j=999; /* do not count for this lead byte because there are assignments */
675                    }
676                    break;
677                default:
678                    break;
679                }
680            }
681            if(j==256) {
682                /* all trail bytes for this lead byte are unassigned */
683                savings+=count[i];
684            } else {
685                count[i]=0;
686            }
687        }
688    }
689    /* subtract from the possible savings the cost of an additional state */
690    savings=savings*2-1024; /* count bytes, not 16-bit words */
691    if(savings<=0) {
692        return;
693    }
694    if(verbose) {
695        printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
696    }
697    if(states->countStates>=MBCS_MAX_STATE_COUNT) {
698        fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
699        return;
700    }
701
702    /* make a copy of the state table */
703    oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024);
704    if(oldStateTable==NULL) {
705        fprintf(stderr, "cannot compact toUnicode: out of memory\n");
706        return;
707    }
708    uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
709
710    /* add the new state */
711    /*
712     * this function does not catch the degenerate case where all lead bytes
713     * have all-unassigned trail bytes and the lead state could be removed
714     */
715    newState=states->countStates++;
716    states->stateFlags[newState]=0;
717    /* copy the old trail state, turning all assigned states into unassigned ones */
718    for(i=0; i<256; ++i) {
719        entry=states->stateTable[trailState][i];
720        switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
721        case MBCS_STATE_VALID_16:
722        case MBCS_STATE_VALID_16_PAIR:
723            states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
724            break;
725        default:
726            states->stateTable[newState][i]=entry;
727            break;
728        }
729    }
730
731    /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
732    for(i=0; i<256; ++i) {
733        if(count[i]>0) {
734            states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
735        }
736    }
737
738    /* sum up the new state table */
739    for(i=0; i<states->countStates; ++i) {
740        states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
741    }
742    sum=sumUpStates(states);
743
744    /* allocate a new, smaller code units array */
745    oldUnicodeCodeUnits=*pUnicodeCodeUnits;
746    if(sum==0) {
747        *pUnicodeCodeUnits=NULL;
748        if(oldUnicodeCodeUnits!=NULL) {
749            uprv_free(oldUnicodeCodeUnits);
750        }
751        uprv_free(oldStateTable);
752        return;
753    }
754    *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
755    if(*pUnicodeCodeUnits==NULL) {
756        fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
757            (long)sum);
758        /* revert to the old state table */
759        *pUnicodeCodeUnits=oldUnicodeCodeUnits;
760        --states->countStates;
761        uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
762        uprv_free(oldStateTable);
763        return;
764    }
765    for(i=0; i<sum; ++i) {
766        (*pUnicodeCodeUnits)[i]=0xfffe;
767    }
768
769    /* copy the code units for all assigned characters */
770    /*
771     * The old state table has the same lead _and_ trail states for assigned characters!
772     * The differences are in the offsets, and in the trail states for some unassigned characters.
773     * For each character with an assigned state in the new table, it was assigned in the old one.
774     * Only still-assigned characters are copied.
775     * Note that fallback mappings need to get their offset values adjusted.
776     */
777
778    /* for each initial state */
779    for(leadState=0; leadState<states->countStates; ++leadState) {
780        if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
781            /* for each lead byte from there */
782            for(i=0; i<256; ++i) {
783                entry=states->stateTable[leadState][i];
784                if(MBCS_ENTRY_IS_TRANSITION(entry)) {
785                    trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
786                    /* the new state does not have assigned states */
787                    if(trailState!=newState) {
788                        trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
789                        oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
790                        /* for each trail byte */
791                        for(j=0; j<256; ++j) {
792                            entry=states->stateTable[trailState][j];
793                            /* copy assigned-character code units and adjust fallback offsets */
794                            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
795                            case MBCS_STATE_VALID_16:
796                                offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
797                                /* find the old offset according to the old state table */
798                                oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
799                                unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
800                                if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
801                                    toUFallbacks[fallback].offset=0x80000000|offset;
802                                }
803                                break;
804                            case MBCS_STATE_VALID_16_PAIR:
805                                offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
806                                /* find the old offset according to the old state table */
807                                oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
808                                (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
809                                (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
810                                break;
811                            default:
812                                break;
813                            }
814                        }
815                    }
816                }
817            }
818        }
819    }
820
821    /* remove temporary flags from fallback offsets that protected them from being modified twice */
822    for(i=0; i<countToUFallbacks; ++i) {
823        toUFallbacks[i].offset&=0x7fffffff;
824    }
825
826    /* free temporary memory */
827    uprv_free(oldUnicodeCodeUnits);
828    uprv_free(oldStateTable);
829}
830
831/*
832 * recursive sub-function of compactToUnicodeHelper()
833 * returns:
834 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
835 *    if all sequences from this state are unassigned, returns the
836 * <0 there are assignments in unicodeCodeUnits[]
837 * 0  no use of unicodeCodeUnits[]
838 */
839static int32_t
840findUnassigned(UCMStates *states,
841               uint16_t *unicodeCodeUnits,
842               _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
843               int32_t state, int32_t offset, uint32_t b) {
844    int32_t i, entry, savings, localSavings, belowSavings;
845    UBool haveAssigned;
846
847    localSavings=belowSavings=0;
848    haveAssigned=FALSE;
849    for(i=0; i<256; ++i) {
850        entry=states->stateTable[state][i];
851        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
852            savings=findUnassigned(states,
853                        unicodeCodeUnits,
854                        toUFallbacks, countToUFallbacks,
855                        MBCS_ENTRY_TRANSITION_STATE(entry),
856                        offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
857                        (b<<8)|(uint32_t)i);
858            if(savings<0) {
859                haveAssigned=TRUE;
860            } else if(savings>0) {
861                printf("    all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
862                    (unsigned long)((b<<8)|i), (long)state, (long)savings);
863                belowSavings+=savings;
864            }
865        } else if(!haveAssigned) {
866            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
867            case MBCS_STATE_VALID_16:
868                entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
869                if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
870                    localSavings+=2;
871                } else {
872                    haveAssigned=TRUE;
873                }
874                break;
875            case MBCS_STATE_VALID_16_PAIR:
876                entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
877                if(unicodeCodeUnits[entry]==0xfffe) {
878                    localSavings+=4;
879                } else {
880                    haveAssigned=TRUE;
881                }
882                break;
883            default:
884                break;
885            }
886        }
887    }
888    if(haveAssigned) {
889        return -1;
890    } else {
891        return localSavings+belowSavings;
892    }
893}
894
895/* helper function for finding compaction opportunities */
896static void
897compactToUnicodeHelper(UCMStates *states,
898                       uint16_t *unicodeCodeUnits,
899                       _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
900    int32_t state, savings;
901
902    /* for each initial state */
903    for(state=0; state<states->countStates; ++state) {
904        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
905            savings=findUnassigned(states,
906                        unicodeCodeUnits,
907                        toUFallbacks, countToUFallbacks,
908                        state, 0, 0);
909            if(savings>0) {
910                printf("    all-unassigned sequences from initial state %ld use %ld bytes\n",
911                    (long)state, (long)savings);
912            }
913        }
914    }
915}
916
917static int32_t
918compareFallbacks(const void *context, const void *fb1, const void *fb2) {
919    return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
920}
921
922U_CAPI void U_EXPORT2
923ucm_optimizeStates(UCMStates *states,
924                   uint16_t **pUnicodeCodeUnits,
925                   _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
926                   UBool verbose) {
927    UErrorCode errorCode;
928    int32_t state, cell, entry;
929
930    /* test each state table entry */
931    for(state=0; state<states->countStates; ++state) {
932        for(cell=0; cell<256; ++cell) {
933            entry=states->stateTable[state][cell];
934            /*
935             * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
936             * and the code point is "unassigned" (0xfffe), then change it to
937             * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
938             */
939            if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
940                states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
941            }
942        }
943    }
944
945    /* try to compact the toUnicode tables */
946    if(states->maxCharLength==2) {
947        compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
948    } else if(states->maxCharLength>2) {
949        if(verbose) {
950            compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
951        }
952    }
953
954    /* sort toUFallbacks */
955    /*
956     * It should be safe to sort them before compactToUnicode2() is called,
957     * because it should not change the relative order of the offset values
958     * that it adjusts, but they need to be sorted at some point, and
959     * it is safest here.
960     */
961    if(countToUFallbacks>0) {
962        errorCode=U_ZERO_ERROR; /* nothing bad will happen... */
963        uprv_sortArray(toUFallbacks, countToUFallbacks,
964                       sizeof(_MBCSToUFallback),
965                       compareFallbacks, NULL, FALSE, &errorCode);
966    }
967}
968
969/* use a complete state table ----------------------------------------------- */
970
971U_CAPI int32_t U_EXPORT2
972ucm_countChars(UCMStates *states,
973               const uint8_t *bytes, int32_t length) {
974    uint32_t offset;
975    int32_t i, entry, count;
976    uint8_t state;
977
978    offset=0;
979    count=0;
980    state=0;
981
982    if(states->countStates==0) {
983        fprintf(stderr, "ucm error: there is no state information!\n");
984        return -1;
985    }
986
987    /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
988    if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
989        state=1;
990    }
991
992    /*
993     * Walk down the state table like in conversion,
994     * much like getNextUChar().
995     * We assume that c<=0x10ffff.
996     */
997    for(i=0; i<length; ++i) {
998        entry=states->stateTable[state][bytes[i]];
999        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1000            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1001            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1002        } else {
1003            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
1004            case MBCS_STATE_ILLEGAL:
1005                fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
1006                return -1;
1007            case MBCS_STATE_CHANGE_ONLY:
1008                fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
1009                return -1;
1010            case MBCS_STATE_UNASSIGNED:
1011            case MBCS_STATE_FALLBACK_DIRECT_16:
1012            case MBCS_STATE_VALID_DIRECT_16:
1013            case MBCS_STATE_FALLBACK_DIRECT_20:
1014            case MBCS_STATE_VALID_DIRECT_20:
1015            case MBCS_STATE_VALID_16:
1016            case MBCS_STATE_VALID_16_PAIR:
1017                /* count a complete character and prepare for a new one */
1018                ++count;
1019                state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1020                offset=0;
1021                break;
1022            default:
1023                /* reserved, must never occur */
1024                fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry);
1025                return -1;
1026            }
1027        }
1028    }
1029
1030    if(offset!=0) {
1031        fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state);
1032        return -1;
1033    }
1034
1035    /*
1036     * for SI/SO (like EBCDIC-stateful), multiple-character results
1037     * must consist of only double-byte sequences
1038     */
1039    if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
1040        fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count);
1041        return -1;
1042    }
1043
1044    return count;
1045}
1046#endif
1047
1048