1/*
2 * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7#define USING_IMPORTED_MAPS
8#define USING_BINARY_PAIR_SEARCH
9#define EXTERN_JISX0213_PAIR
10#define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
11#define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
12
13#include "cjkcodecs.h"
14#include "alg_jisx0201.h"
15#include "emu_jisx0213_2000.h"
16#include "mappings_jisx0213_pair.h"
17
18/* STATE
19
20   state->c[0-3]
21
22    00000000
23    ||^^^^^|
24    |+-----+----  G0-3 Character Set
25    +-----------  Is G0-3 double byte?
26
27   state->c[4]
28
29    00000000
30          ||
31          |+----  Locked-Shift?
32          +-----  ESC Throughout
33*/
34
/* Control bytes that the encoder/decoder below treat specially. */
#define ESC                     0x1B
#define SO                      0x0E
#define SI                      0x0F
#define LF                      0x0A

/* Upper bound on how many bytes iso2022processesc() scans while looking
 * for the final byte of an escape sequence. */
#define MAX_ESCSEQLEN           16

/* Single-byte character set marks.  The encoder writes the mark as the
 * final byte of the designating escape sequence. */
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_K      'I'
#define CHARSET_JISX0201_R      'J'

/* Double-byte character set marks: the final byte with CHARSET_DBCS set,
 * so that e.g. GB2312 ('A') cannot be confused with ISO8859-1 ('A'). */
#define CHARSET_GB2312          ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
#define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)

/* CHARSET_DBCS tags a mark as double-byte; ESCMARK() strips the tag again
 * to recover the byte that is written in the escape sequence. */
#define CHARSET_DBCS            0x80
#define ESCMARK(mark)           ((mark) & 0x7f)

/* IS_ESCEND: true for a byte that ends an escape sequence ('A'..'Z', '@').
 * IS_ISO2022ESC: true when the byte following ESC opens a sequence that
 * iso2022processesc() interprets. */
#define IS_ESCEND(c)    (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
#define IS_ISO2022ESC(c2) \
        ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
         (c2) == '.' || (c2) == '&')
    /* This is not a complete list of ISO-2022 escape sequence headers,
     * but it is enough to implement the CJK instances of ISO-2022. */

/* Sentinel results of the per-charset decoder/encoder functions. */
#define MAP_UNMAPPABLE          0xFFFF
#define MAP_MULTIPLE_AVAIL      0xFFFE /* for JIS X 0213 */

/* Flag bits kept in state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02

/* Accessors for the G0..G3 designations kept in state->c[0..3].
 * Note: STATE_SETG() expands to a complete statement including the
 * trailing semicolon, which is why call sites omit it. */
#define STATE_SETG(dn, v)       ((state)->c[dn]) = (v);
#define STATE_GETG(dn)          ((state)->c[dn])

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)
#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* Flag helpers; the setters likewise carry their own semicolon. */
#define STATE_SETFLAG(f)        ((state)->c[4]) |= (f);
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_CLEARFLAG(f)      ((state)->c[4]) &= ~(f);
#define STATE_CLEARFLAGS()      ((state)->c[4]) = 0;

/* View of the opaque `config` pointer as this codec's configuration. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)config)
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT                0x01
#define USE_G2                  0x02
#define USE_JISX0208_EXT        0x04
102/*-*- internal data structures -*-*/
103
/* Per-charset callbacks.
 *   init:   returns 0 on success, non-zero on failure (see CODEC_INIT).
 *   decode: maps the bytes at `data` to a code point, or MAP_UNMAPPABLE.
 *   encode: maps the code point(s) at `data` to a charset code, or
 *           MAP_UNMAPPABLE / MAP_MULTIPLE_AVAIL; `*length` is in/out. */
typedef int (*iso2022_init_func)(void);
typedef ucs4_t (*iso2022_decode_func)(const unsigned char *data);
typedef DBCHAR (*iso2022_encode_func)(const ucs4_t *data, Py_ssize_t *length);

/* One entry of a codec's designation table.  Tables are scanned until an
 * entry whose `mark` is 0. */
struct iso2022_designation {
    unsigned char mark;             /* CHARSET_* identifier */
    unsigned char plane;            /* 0 = G0, 1 = G1, ... */
    unsigned char width;            /* bytes per character: 1 or 2 */
    iso2022_init_func initializer;  /* may be NULL */
    iso2022_decode_func decoder;
    iso2022_encode_func encoder;
};

/* Configuration of one ISO-2022 variant. */
struct iso2022_config {
    int flags;                      /* NO_SHIFT | USE_G2 | USE_JISX0208_EXT */
    const struct iso2022_designation *designations; /* non-ascii desigs */
};
121
122/*-*- iso-2022 codec implementation -*-*/
123
124CODEC_INIT(iso2022)
125{
126    const struct iso2022_designation *desig = CONFIG_DESIGNATIONS;
127    for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++)
128        if (desig->initializer != NULL && desig->initializer() != 0)
129            return -1;
130    return 0;
131}
132
133ENCODER_INIT(iso2022)
134{
135    STATE_CLEARFLAGS()
136    STATE_SETG0(CHARSET_ASCII)
137    STATE_SETG1(CHARSET_ASCII)
138    return 0;
139}
140
141ENCODER_RESET(iso2022)
142{
143    if (STATE_GETFLAG(F_SHIFTED)) {
144        WRITE1(SI)
145        NEXT_OUT(1)
146        STATE_CLEARFLAG(F_SHIFTED)
147    }
148    if (STATE_G0 != CHARSET_ASCII) {
149        WRITE3(ESC, '(', 'B')
150        NEXT_OUT(3)
151        STATE_SETG0(CHARSET_ASCII)
152    }
153    return 0;
154}
155
156ENCODER(iso2022)
157{
158    while (inleft > 0) {
159        const struct iso2022_designation *dsg;
160        DBCHAR encoded;
161        ucs4_t c = **inbuf;
162        Py_ssize_t insize;
163
164        if (c < 0x80) {
165            if (STATE_G0 != CHARSET_ASCII) {
166                WRITE3(ESC, '(', 'B')
167                STATE_SETG0(CHARSET_ASCII)
168                NEXT_OUT(3)
169            }
170            if (STATE_GETFLAG(F_SHIFTED)) {
171                WRITE1(SI)
172                STATE_CLEARFLAG(F_SHIFTED)
173                NEXT_OUT(1)
174            }
175            WRITE1((unsigned char)c)
176            NEXT(1, 1)
177            continue;
178        }
179
180        DECODE_SURROGATE(c)
181        insize = GET_INSIZE(c);
182
183        encoded = MAP_UNMAPPABLE;
184        for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
185            Py_ssize_t length = 1;
186            encoded = dsg->encoder(&c, &length);
187            if (encoded == MAP_MULTIPLE_AVAIL) {
188                /* this implementation won't work for pair
189                 * of non-bmp characters. */
190                if (inleft < 2) {
191                    if (!(flags & MBENC_FLUSH))
192                        return MBERR_TOOFEW;
193                    length = -1;
194                }
195                else
196                    length = 2;
197#if Py_UNICODE_SIZE == 2
198                if (length == 2) {
199                    ucs4_t u4in[2];
200                    u4in[0] = (ucs4_t)IN1;
201                    u4in[1] = (ucs4_t)IN2;
202                    encoded = dsg->encoder(u4in, &length);
203                } else
204                    encoded = dsg->encoder(&c, &length);
205#else
206                encoded = dsg->encoder(&c, &length);
207#endif
208                if (encoded != MAP_UNMAPPABLE) {
209                    insize = length;
210                    break;
211                }
212            }
213            else if (encoded != MAP_UNMAPPABLE)
214                break;
215        }
216
217        if (!dsg->mark)
218            return 1;
219        assert(dsg->width == 1 || dsg->width == 2);
220
221        switch (dsg->plane) {
222        case 0: /* G0 */
223            if (STATE_GETFLAG(F_SHIFTED)) {
224                WRITE1(SI)
225                STATE_CLEARFLAG(F_SHIFTED)
226                NEXT_OUT(1)
227            }
228            if (STATE_G0 != dsg->mark) {
229                if (dsg->width == 1) {
230                    WRITE3(ESC, '(', ESCMARK(dsg->mark))
231                    STATE_SETG0(dsg->mark)
232                    NEXT_OUT(3)
233                }
234                else if (dsg->mark == CHARSET_JISX0208) {
235                    WRITE3(ESC, '$', ESCMARK(dsg->mark))
236                    STATE_SETG0(dsg->mark)
237                    NEXT_OUT(3)
238                }
239                else {
240                    WRITE4(ESC, '$', '(',
241                        ESCMARK(dsg->mark))
242                    STATE_SETG0(dsg->mark)
243                    NEXT_OUT(4)
244                }
245            }
246            break;
247        case 1: /* G1 */
248            if (STATE_G1 != dsg->mark) {
249                if (dsg->width == 1) {
250                    WRITE3(ESC, ')', ESCMARK(dsg->mark))
251                    STATE_SETG1(dsg->mark)
252                    NEXT_OUT(3)
253                }
254                else {
255                    WRITE4(ESC, '$', ')',
256                        ESCMARK(dsg->mark))
257                    STATE_SETG1(dsg->mark)
258                    NEXT_OUT(4)
259                }
260            }
261            if (!STATE_GETFLAG(F_SHIFTED)) {
262                WRITE1(SO)
263                STATE_SETFLAG(F_SHIFTED)
264                NEXT_OUT(1)
265            }
266            break;
267        default: /* G2 and G3 is not supported: no encoding in
268                  * CJKCodecs are using them yet */
269            return MBERR_INTERNAL;
270        }
271
272        if (dsg->width == 1) {
273            WRITE1((unsigned char)encoded)
274            NEXT_OUT(1)
275        }
276        else {
277            WRITE2(encoded >> 8, encoded & 0xff)
278            NEXT_OUT(2)
279        }
280        NEXT_IN(insize)
281    }
282
283    return 0;
284}
285
286DECODER_INIT(iso2022)
287{
288    STATE_CLEARFLAGS()
289    STATE_SETG0(CHARSET_ASCII)
290    STATE_SETG1(CHARSET_ASCII)
291    STATE_SETG2(CHARSET_ASCII)
292    return 0;
293}
294
295DECODER_RESET(iso2022)
296{
297    STATE_SETG0(CHARSET_ASCII)
298    STATE_CLEARFLAG(F_SHIFTED)
299    return 0;
300}
301
302static Py_ssize_t
303iso2022processesc(const void *config, MultibyteCodec_State *state,
304                  const unsigned char **inbuf, Py_ssize_t *inleft)
305{
306    unsigned char charset, designation;
307    Py_ssize_t i, esclen;
308
309    for (i = 1;i < MAX_ESCSEQLEN;i++) {
310        if (i >= *inleft)
311            return MBERR_TOOFEW;
312        if (IS_ESCEND((*inbuf)[i])) {
313            esclen = i + 1;
314            break;
315        }
316        else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
317                 (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@')
318            i += 2;
319    }
320
321    if (i >= MAX_ESCSEQLEN)
322        return 1; /* unterminated escape sequence */
323
324    switch (esclen) {
325    case 3:
326        if (IN2 == '$') {
327            charset = IN3 | CHARSET_DBCS;
328            designation = 0;
329        }
330        else {
331            charset = IN3;
332            if (IN2 == '(') designation = 0;
333            else if (IN2 == ')') designation = 1;
334            else if (CONFIG_ISSET(USE_G2) && IN2 == '.')
335                designation = 2;
336            else return 3;
337        }
338        break;
339    case 4:
340        if (IN2 != '$')
341            return 4;
342
343        charset = IN4 | CHARSET_DBCS;
344        if (IN3 == '(') designation = 0;
345        else if (IN3 == ')') designation = 1;
346        else return 4;
347        break;
348    case 6: /* designation with prefix */
349        if (CONFIG_ISSET(USE_JISX0208_EXT) &&
350            (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
351            (*inbuf)[5] == 'B') {
352            charset = 'B' | CHARSET_DBCS;
353            designation = 0;
354        }
355        else
356            return 6;
357        break;
358    default:
359        return esclen;
360    }
361
362    /* raise error when the charset is not designated for this encoding */
363    if (charset != CHARSET_ASCII) {
364        const struct iso2022_designation *dsg;
365
366        for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++)
367            if (dsg->mark == charset)
368                break;
369        if (!dsg->mark)
370            return esclen;
371    }
372
373    STATE_SETG(designation, charset)
374    *inleft -= esclen;
375    (*inbuf) += esclen;
376    return 0;
377}
378
/* Decode one ISO 8859-7 byte `c` into `assi`.
 *
 * Expands to an if / else-if chain with no final `else`, so the caller
 * appends its own `else ...` to handle unassigned bytes.  The two bit masks
 * select which bytes map to themselves (0xa0..0xbf) and which map to
 * 0x02d0 + c (0xb4..0xd3); both are indexed with shift counts up to 31, so
 * the operands are unsigned long to keep the shift well defined where long
 * is 32 bits wide.
 */
#define ISO8859_7_DECODE(c, assi)                                       \
    if ((c) < 0xa0) (assi) = (c);                                       \
    else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0))))        \
        (assi) = (c);                                                   \
    else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||              \
             (0xbffffd77UL & (1UL << ((c)-0xb4)))))                     \
        (assi) = 0x02d0 + (c);                                          \
    else if ((c) == 0xa1) (assi) = 0x2018;                              \
    else if ((c) == 0xa2) (assi) = 0x2019;                              \
    else if ((c) == 0xaf) (assi) = 0x2015;
389
390static Py_ssize_t
391iso2022processg2(const void *config, MultibyteCodec_State *state,
392                 const unsigned char **inbuf, Py_ssize_t *inleft,
393                 Py_UNICODE **outbuf, Py_ssize_t *outleft)
394{
395    /* not written to use encoder, decoder functions because only few
396     * encodings use G2 designations in CJKCodecs */
397    if (STATE_G2 == CHARSET_ISO8859_1) {
398        if (IN3 < 0x80)
399            OUT1(IN3 + 0x80)
400        else
401            return 3;
402    }
403    else if (STATE_G2 == CHARSET_ISO8859_7) {
404        ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)
405        else return 3;
406    }
407    else if (STATE_G2 == CHARSET_ASCII) {
408        if (IN3 & 0x80) return 3;
409        else **outbuf = IN3;
410    }
411    else
412        return MBERR_INTERNAL;
413
414    (*inbuf) += 3;
415    *inleft -= 3;
416    (*outbuf) += 1;
417    *outleft -= 1;
418    return 0;
419}
420
421DECODER(iso2022)
422{
423    const struct iso2022_designation *dsgcache = NULL;
424
425    while (inleft > 0) {
426        unsigned char c = IN1;
427        Py_ssize_t err;
428
429        if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
430            /* ESC throughout mode:
431             * for non-iso2022 escape sequences */
432            WRITE1(c) /* assume as ISO-8859-1 */
433            NEXT(1, 1)
434            if (IS_ESCEND(c)) {
435                STATE_CLEARFLAG(F_ESCTHROUGHOUT)
436            }
437            continue;
438        }
439
440        switch (c) {
441        case ESC:
442            REQUIRE_INBUF(2)
443            if (IS_ISO2022ESC(IN2)) {
444                err = iso2022processesc(config, state,
445                                        inbuf, &inleft);
446                if (err != 0)
447                    return err;
448            }
449            else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */
450                REQUIRE_INBUF(3)
451                err = iso2022processg2(config, state,
452                    inbuf, &inleft, outbuf, &outleft);
453                if (err != 0)
454                    return err;
455            }
456            else {
457                WRITE1(ESC)
458                STATE_SETFLAG(F_ESCTHROUGHOUT)
459                NEXT(1, 1)
460            }
461            break;
462        case SI:
463            if (CONFIG_ISSET(NO_SHIFT))
464                goto bypass;
465            STATE_CLEARFLAG(F_SHIFTED)
466            NEXT_IN(1)
467            break;
468        case SO:
469            if (CONFIG_ISSET(NO_SHIFT))
470                goto bypass;
471            STATE_SETFLAG(F_SHIFTED)
472            NEXT_IN(1)
473            break;
474        case LF:
475            STATE_CLEARFLAG(F_SHIFTED)
476            WRITE1(LF)
477            NEXT(1, 1)
478            break;
479        default:
480            if (c < 0x20) /* C0 */
481                goto bypass;
482            else if (c >= 0x80)
483                return 1;
484            else {
485                const struct iso2022_designation *dsg;
486                unsigned char charset;
487                ucs4_t decoded;
488
489                if (STATE_GETFLAG(F_SHIFTED))
490                    charset = STATE_G1;
491                else
492                    charset = STATE_G0;
493
494                if (charset == CHARSET_ASCII) {
495bypass:                                 WRITE1(c)
496                                        NEXT(1, 1)
497                                        break;
498                                }
499
500                                if (dsgcache != NULL &&
501                                    dsgcache->mark == charset)
502                                        dsg = dsgcache;
503                                else {
504                                        for (dsg = CONFIG_DESIGNATIONS;
505                                             dsg->mark != charset
506#ifdef Py_DEBUG
507                                                && dsg->mark != '\0'
508#endif
509                                             ;dsg++)
510                                                /* noop */;
511                                        assert(dsg->mark != '\0');
512                                        dsgcache = dsg;
513                                }
514
515                                REQUIRE_INBUF(dsg->width)
516                                decoded = dsg->decoder(*inbuf);
517                                if (decoded == MAP_UNMAPPABLE)
518                                        return dsg->width;
519
520                                if (decoded < 0x10000) {
521                                        WRITE1(decoded)
522                                        NEXT_OUT(1)
523                                }
524                                else if (decoded < 0x30000) {
525                                        WRITEUCS4(decoded)
526                                }
527                                else { /* JIS X 0213 pairs */
528                    WRITE2(decoded >> 16, decoded & 0xffff)
529                    NEXT_OUT(2)
530                }
531                NEXT_IN(dsg->width)
532            }
533            break;
534        }
535    }
536    return 0;
537}
538
539/*-*- mapping table holders -*-*/
540
/* ENCMAP(x) / DECMAP(x) define the file-local pointers x_encmap / x_decmap.
 * They start out NULL and are filled in by the *_init() functions below
 * through IMPORT_MAP. */
#define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
#define DECMAP(enc) static const decode_map *enc##_decmap = NULL;

/* kr */
ENCMAP(cp949)
DECMAP(ksx1001)

/* jp */
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
ENCMAP(jisx0213_bmp)
DECMAP(jisx0213_1_bmp)
DECMAP(jisx0213_2_bmp)
ENCMAP(jisx0213_emp)
DECMAP(jisx0213_1_emp)
DECMAP(jisx0213_2_emp)

/* cn */
ENCMAP(gbcommon)
DECMAP(gb2312)

/* tw (no mapping tables are declared here) */
564
565/*-*- mapping access functions -*-*/
566
567static int
568ksx1001_init(void)
569{
570    static int initialized = 0;
571
572    if (!initialized && (
573                    IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
574                    IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
575        return -1;
576    initialized = 1;
577    return 0;
578}
579
580static ucs4_t
581ksx1001_decoder(const unsigned char *data)
582{
583    ucs4_t u;
584    TRYMAP_DEC(ksx1001, u, data[0], data[1])
585        return u;
586    else
587        return MAP_UNMAPPABLE;
588}
589
590static DBCHAR
591ksx1001_encoder(const ucs4_t *data, Py_ssize_t *length)
592{
593    DBCHAR coded;
594    assert(*length == 1);
595    if (*data < 0x10000) {
596        TRYMAP_ENC(cp949, coded, *data)
597            if (!(coded & 0x8000))
598                return coded;
599    }
600    return MAP_UNMAPPABLE;
601}
602
603static int
604jisx0208_init(void)
605{
606    static int initialized = 0;
607
608    if (!initialized && (
609                    IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
610                    IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
611        return -1;
612    initialized = 1;
613    return 0;
614}
615
616static ucs4_t
617jisx0208_decoder(const unsigned char *data)
618{
619    ucs4_t u;
620    if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
621        return 0xff3c;
622    else TRYMAP_DEC(jisx0208, u, data[0], data[1])
623        return u;
624    else
625        return MAP_UNMAPPABLE;
626}
627
628static DBCHAR
629jisx0208_encoder(const ucs4_t *data, Py_ssize_t *length)
630{
631    DBCHAR coded;
632    assert(*length == 1);
633    if (*data < 0x10000) {
634        if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
635            return 0x2140;
636        else TRYMAP_ENC(jisxcommon, coded, *data) {
637            if (!(coded & 0x8000))
638                return coded;
639        }
640    }
641    return MAP_UNMAPPABLE;
642}
643
644static int
645jisx0212_init(void)
646{
647    static int initialized = 0;
648
649    if (!initialized && (
650                    IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
651                    IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
652        return -1;
653    initialized = 1;
654    return 0;
655}
656
657static ucs4_t
658jisx0212_decoder(const unsigned char *data)
659{
660    ucs4_t u;
661    TRYMAP_DEC(jisx0212, u, data[0], data[1])
662        return u;
663    else
664        return MAP_UNMAPPABLE;
665}
666
667static DBCHAR
668jisx0212_encoder(const ucs4_t *data, Py_ssize_t *length)
669{
670    DBCHAR coded;
671    assert(*length == 1);
672    if (*data < 0x10000) {
673        TRYMAP_ENC(jisxcommon, coded, *data) {
674            if (coded & 0x8000)
675                return coded & 0x7fff;
676        }
677    }
678    return MAP_UNMAPPABLE;
679}
680
681static int
682jisx0213_init(void)
683{
684    static int initialized = 0;
685
686    if (!initialized && (
687                    jisx0208_init() ||
688                    IMPORT_MAP(jp, jisx0213_bmp,
689                               &jisx0213_bmp_encmap, NULL) ||
690                    IMPORT_MAP(jp, jisx0213_1_bmp,
691                               NULL, &jisx0213_1_bmp_decmap) ||
692                    IMPORT_MAP(jp, jisx0213_2_bmp,
693                               NULL, &jisx0213_2_bmp_decmap) ||
694                    IMPORT_MAP(jp, jisx0213_emp,
695                               &jisx0213_emp_encmap, NULL) ||
696                    IMPORT_MAP(jp, jisx0213_1_emp,
697                               NULL, &jisx0213_1_emp_decmap) ||
698                    IMPORT_MAP(jp, jisx0213_2_emp,
699                               NULL, &jisx0213_2_emp_decmap) ||
700                    IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
701                               &jisx0213_pair_decmap)))
702        return -1;
703    initialized = 1;
704    return 0;
705}
706
707#define config ((void *)2000)
708static ucs4_t
709jisx0213_2000_1_decoder(const unsigned char *data)
710{
711    ucs4_t u;
712    EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
713    else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
714        return 0xff3c;
715    else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
716    else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
717    else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
718        u |= 0x20000;
719    else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
720    else
721        return MAP_UNMAPPABLE;
722    return u;
723}
724
725static ucs4_t
726jisx0213_2000_2_decoder(const unsigned char *data)
727{
728    ucs4_t u;
729    EMULATE_JISX0213_2000_DECODE_PLANE2(u, data[0], data[1])
730    TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
731    else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
732        u |= 0x20000;
733    else
734        return MAP_UNMAPPABLE;
735    return u;
736}
737#undef config
738
739static ucs4_t
740jisx0213_2004_1_decoder(const unsigned char *data)
741{
742    ucs4_t u;
743    if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
744        return 0xff3c;
745    else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
746    else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
747    else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
748        u |= 0x20000;
749    else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
750    else
751        return MAP_UNMAPPABLE;
752    return u;
753}
754
755static ucs4_t
756jisx0213_2004_2_decoder(const unsigned char *data)
757{
758    ucs4_t u;
759    TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
760    else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
761        u |= 0x20000;
762    else
763        return MAP_UNMAPPABLE;
764    return u;
765}
766
767static DBCHAR
768jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config)
769{
770    DBCHAR coded;
771
772    switch (*length) {
773    case 1: /* first character */
774        if (*data >= 0x10000) {
775            if ((*data) >> 16 == 0x20000 >> 16) {
776                EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
777                else TRYMAP_ENC(jisx0213_emp, coded,
778                                (*data) & 0xffff)
779                    return coded;
780            }
781            return MAP_UNMAPPABLE;
782        }
783
784        EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
785        else TRYMAP_ENC(jisx0213_bmp, coded, *data) {
786            if (coded == MULTIC)
787                return MAP_MULTIPLE_AVAIL;
788        }
789        else TRYMAP_ENC(jisxcommon, coded, *data) {
790            if (coded & 0x8000)
791                return MAP_UNMAPPABLE;
792        }
793        else
794            return MAP_UNMAPPABLE;
795        return coded;
796    case 2: /* second character of unicode pair */
797        coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
798                        jisx0213_pair_encmap, JISX0213_ENCPAIRS);
799        if (coded == DBCINV) {
800            *length = 1;
801            coded = find_pairencmap((ucs2_t)data[0], 0,
802                      jisx0213_pair_encmap, JISX0213_ENCPAIRS);
803            if (coded == DBCINV)
804                return MAP_UNMAPPABLE;
805        }
806        else
807            return coded;
808    case -1: /* flush unterminated */
809        *length = 1;
810        coded = find_pairencmap((ucs2_t)data[0], 0,
811                        jisx0213_pair_encmap, JISX0213_ENCPAIRS);
812        if (coded == DBCINV)
813            return MAP_UNMAPPABLE;
814        else
815            return coded;
816    default:
817        return MAP_UNMAPPABLE;
818    }
819}
820
821static DBCHAR
822jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length)
823{
824    DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
825    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
826        return coded;
827    else if (coded & 0x8000)
828        return MAP_UNMAPPABLE;
829    else
830        return coded;
831}
832
833static DBCHAR
834jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
835{
836    DBCHAR coded;
837    Py_ssize_t ilength = *length;
838
839    coded = jisx0213_encoder(data, length, (void *)2000);
840    switch (ilength) {
841    case 1:
842        if (coded == MAP_MULTIPLE_AVAIL)
843            return MAP_MULTIPLE_AVAIL;
844        else
845            return MAP_UNMAPPABLE;
846    case 2:
847        if (*length != 2)
848            return MAP_UNMAPPABLE;
849        else
850            return coded;
851    default:
852        return MAP_UNMAPPABLE;
853    }
854}
855
856static DBCHAR
857jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length)
858{
859    DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
860    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
861        return coded;
862    else if (coded & 0x8000)
863        return coded & 0x7fff;
864    else
865        return MAP_UNMAPPABLE;
866}
867
868static DBCHAR
869jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length)
870{
871    DBCHAR coded = jisx0213_encoder(data, length, NULL);
872    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
873        return coded;
874    else if (coded & 0x8000)
875        return MAP_UNMAPPABLE;
876    else
877        return coded;
878}
879
880static DBCHAR
881jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
882{
883    DBCHAR coded;
884    Py_ssize_t ilength = *length;
885
886    coded = jisx0213_encoder(data, length, NULL);
887    switch (ilength) {
888    case 1:
889        if (coded == MAP_MULTIPLE_AVAIL)
890            return MAP_MULTIPLE_AVAIL;
891        else
892            return MAP_UNMAPPABLE;
893    case 2:
894        if (*length != 2)
895            return MAP_UNMAPPABLE;
896        else
897            return coded;
898    default:
899        return MAP_UNMAPPABLE;
900    }
901}
902
903static DBCHAR
904jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length)
905{
906    DBCHAR coded = jisx0213_encoder(data, length, NULL);
907    if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
908        return coded;
909    else if (coded & 0x8000)
910        return coded & 0x7fff;
911    else
912        return MAP_UNMAPPABLE;
913}
914
915static ucs4_t
916jisx0201_r_decoder(const unsigned char *data)
917{
918    ucs4_t u;
919    JISX0201_R_DECODE(*data, u)
920    else return MAP_UNMAPPABLE;
921    return u;
922}
923
924static DBCHAR
925jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length)
926{
927    DBCHAR coded;
928    JISX0201_R_ENCODE(*data, coded)
929    else return MAP_UNMAPPABLE;
930    return coded;
931}
932
933static ucs4_t
934jisx0201_k_decoder(const unsigned char *data)
935{
936    ucs4_t u;
937    JISX0201_K_DECODE(*data ^ 0x80, u)
938    else return MAP_UNMAPPABLE;
939    return u;
940}
941
942static DBCHAR
943jisx0201_k_encoder(const ucs4_t *data, Py_ssize_t *length)
944{
945    DBCHAR coded;
946    JISX0201_K_ENCODE(*data, coded)
947    else return MAP_UNMAPPABLE;
948    return coded - 0x80;
949}
950
951static int
952gb2312_init(void)
953{
954    static int initialized = 0;
955
956    if (!initialized && (
957                    IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
958                    IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
959        return -1;
960    initialized = 1;
961    return 0;
962}
963
964static ucs4_t
965gb2312_decoder(const unsigned char *data)
966{
967    ucs4_t u;
968    TRYMAP_DEC(gb2312, u, data[0], data[1])
969        return u;
970    else
971        return MAP_UNMAPPABLE;
972}
973
974static DBCHAR
975gb2312_encoder(const ucs4_t *data, Py_ssize_t *length)
976{
977    DBCHAR coded;
978    assert(*length == 1);
979    if (*data < 0x10000) {
980        TRYMAP_ENC(gbcommon, coded, *data) {
981            if (!(coded & 0x8000))
982                return coded;
983        }
984    }
985    return MAP_UNMAPPABLE;
986}
987
988
989static ucs4_t
990dummy_decoder(const unsigned char *data)
991{
992    return MAP_UNMAPPABLE;
993}
994
995static DBCHAR
996dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
997{
998    return MAP_UNMAPPABLE;
999}
1000
1001/*-*- registry tables -*-*/
1002
1003#define REGISTRY_KSX1001_G0     { CHARSET_KSX1001, 0, 2,                \
1004                  ksx1001_init,                                         \
1005                  ksx1001_decoder, ksx1001_encoder }
1006#define REGISTRY_KSX1001_G1     { CHARSET_KSX1001, 1, 2,                \
1007                  ksx1001_init,                                         \
1008                  ksx1001_decoder, ksx1001_encoder }
1009#define REGISTRY_JISX0201_R     { CHARSET_JISX0201_R, 0, 1,             \
1010                  NULL,                                                 \
1011                  jisx0201_r_decoder, jisx0201_r_encoder }
1012#define REGISTRY_JISX0201_K     { CHARSET_JISX0201_K, 0, 1,             \
1013                  NULL,                                                 \
1014                  jisx0201_k_decoder, jisx0201_k_encoder }
1015#define REGISTRY_JISX0208       { CHARSET_JISX0208, 0, 2,               \
1016                  jisx0208_init,                                        \
1017                  jisx0208_decoder, jisx0208_encoder }
1018#define REGISTRY_JISX0208_O     { CHARSET_JISX0208_O, 0, 2,             \
1019                  jisx0208_init,                                        \
1020                  jisx0208_decoder, jisx0208_encoder }
1021#define REGISTRY_JISX0212       { CHARSET_JISX0212, 0, 2,               \
1022                  jisx0212_init,                                        \
1023                  jisx0212_decoder, jisx0212_encoder }
/*
 * JIS X 0213 registry entries; all of them use jisx0213_init.
 *
 *  - Each *_PAIRONLY entry is identical to its plain *_1 counterpart
 *    except that it plugs in the *_encoder_paironly encoder.
 *  - Both *_2 entries carry the same charset mark, CHARSET_JISX0213_2,
 *    and differ only in their edition-specific decoder/encoder.
 */
#define REGISTRY_JISX0213_2000_1 \
    { CHARSET_JISX0213_2000_1, 0, 2, \
      jisx0213_init, \
      jisx0213_2000_1_decoder, jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY \
    { CHARSET_JISX0213_2000_1, 0, 2, \
      jisx0213_init, \
      jisx0213_2000_1_decoder, jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2 \
    { CHARSET_JISX0213_2, 0, 2, \
      jisx0213_init, \
      jisx0213_2000_2_decoder, jisx0213_2000_2_encoder }
#define REGISTRY_JISX0213_2004_1 \
    { CHARSET_JISX0213_2004_1, 0, 2, \
      jisx0213_init, \
      jisx0213_2004_1_decoder, jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY \
    { CHARSET_JISX0213_2004_1, 0, 2, \
      jisx0213_init, \
      jisx0213_2004_1_decoder, jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2 \
    { CHARSET_JISX0213_2, 0, 2, \
      jisx0213_init, \
      jisx0213_2004_2_decoder, jisx0213_2004_2_encoder }
/*
 * GB 2312 and CNS 11643 registry entries.  The second value is 0 for
 * GB 2312, 1 for CNS 11643 plane 1 and 2 for CNS 11643 plane 2; both CNS
 * entries share cns11643_init.  The CNS entries are not referenced by any
 * designation table in this part of the file.
 */
#define REGISTRY_GB2312 \
    { CHARSET_GB2312, 0, 2, \
      gb2312_init, gb2312_decoder, gb2312_encoder }
#define REGISTRY_CNS11643_1 \
    { CHARSET_CNS11643_1, 1, 2, \
      cns11643_init, cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2 \
    { CHARSET_CNS11643_2, 2, 2, \
      cns11643_init, cns11643_2_decoder, cns11643_2_encoder }
/*
 * ISO 8859-1 and ISO 8859-7 registry entries.  They have no init hook and
 * use dummy_decoder/dummy_encoder, both of which always return
 * MAP_UNMAPPABLE.
 */
#define REGISTRY_ISO8859_1 \
    { CHARSET_ISO8859_1, 2, 1, \
      NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7 \
    { CHARSET_ISO8859_7, 2, 1, \
      NULL, dummy_decoder, dummy_encoder }

/* All-zero entry; it is the last element of every designation table. */
#define REGISTRY_SENTINEL \
    { 0, }
/*
 * CONFIGDEF(var, attrs) defines the constant iso2022_<var>_config,
 * initialised with `attrs` followed by the array
 * iso2022_<var>_designations, which must already be declared.  The
 * expansion carries its own trailing semicolon, so invocations are
 * written without one.
 */
#define CONFIGDEF(var, attrs) \
    static const struct iso2022_config iso2022_##var##_config = { \
        attrs, \
        iso2022_##var##_designations \
    };
1066
1067static const struct iso2022_designation iso2022_kr_designations[] = {
1068    REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
1069};
1070CONFIGDEF(kr, 0)
1071
1072static const struct iso2022_designation iso2022_jp_designations[] = {
1073    REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1074    REGISTRY_SENTINEL
1075};
1076CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
1077
1078static const struct iso2022_designation iso2022_jp_1_designations[] = {
1079    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1080    REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1081};
1082CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
1083
1084static const struct iso2022_designation iso2022_jp_2_designations[] = {
1085    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
1086    REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1087    REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
1088};
1089CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
1090
1091static const struct iso2022_designation iso2022_jp_2004_designations[] = {
1092    REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
1093    REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
1094};
1095CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
1096
1097static const struct iso2022_designation iso2022_jp_3_designations[] = {
1098    REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
1099    REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
1100};
1101CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
1102
1103static const struct iso2022_designation iso2022_jp_ext_designations[] = {
1104    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1105    REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1106};
1107CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
1108
1109
/*
 * Mapping-table list for this module; it is intentionally empty.  The
 * file defines USING_IMPORTED_MAPS at the top.
 * NOTE(review): the tables are presumably obtained from other modules by
 * the *_init hooks named in the registry entries -- confirm.
 */
BEGIN_MAPPINGS_LIST
  /* no mapping table here */
END_MAPPINGS_LIST
1113
/*
 * ISO2022_CODEC(variation) expands to one brace-enclosed codec entry
 * followed by a comma.  The entry holds the name string
 * "iso2022_<variation>", the address of iso2022_<variation>_config,
 * iso2022_codec_init, and whatever _STATEFUL_METHODS(iso2022) expands to.
 * Because the comma is part of the expansion, invocations are listed back
 * to back with no separator between them.
 */
#define ISO2022_CODEC(variation) \
    { "iso2022_" #variation, \
      &iso2022_##variation##_config, \
      iso2022_codec_init, \
      _STATEFUL_METHODS(iso2022) },
1120
/*
 * Codec list: one ISO2022_CODEC entry per variation.  Each entry is named
 * "iso2022_" followed by the variation (for example "iso2022_jp_2") and
 * refers to the iso2022_<variation>_config object of the same variation.
 */
BEGIN_CODECS_LIST
  ISO2022_CODEC(kr)
  ISO2022_CODEC(jp)
  ISO2022_CODEC(jp_1)
  ISO2022_CODEC(jp_2)
  ISO2022_CODEC(jp_2004)
  ISO2022_CODEC(jp_3)
  ISO2022_CODEC(jp_ext)
END_CODECS_LIST
1130
/* NOTE(review): I_AM_A_MODULE_FOR is not defined in this file (presumably
 * it comes from cjkcodecs.h); it appears to emit the extension-module
 * boilerplate for _codecs_iso2022 -- confirm against the header. */
I_AM_A_MODULE_FOR(iso2022)
1132