1/*
2 * _codecs_jp.c: Codecs collection for Japanese encodings
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7#define USING_BINARY_PAIR_SEARCH
8#define EMPBASE 0x20000
9
10#include "cjkcodecs.h"
11#include "mappings_jp.h"
12#include "mappings_jisx0213_pair.h"
13#include "alg_jisx0201.h"
14#include "emu_jisx0213_2000.h"
15
16/*
17 * CP932 codec
18 */
19
/*
 * CP932 encoder: Unicode -> bytes.
 *
 * Each input code unit is tried against these cases, in order:
 *   - U+0000..U+0080       -> the same value as a single byte
 *   - U+FF61..U+FF9F       -> single byte c - 0xfec0 (0xa1..0xdf)
 *   - U+F8F0..U+F8F3       -> single bytes 0xa0, 0xfd, 0xfe, 0xff
 *   - cp932ext table hit   -> two bytes taken straight from the table
 *   - jisxcommon table hit -> JIS X 0208 code folded into Shift_JIS form;
 *                             hits with the MSB set (JIS X 0212) return 1
 *   - U+E000..U+E757       -> user-defined area, lead bytes 0xf0..0xf9
 * Anything else returns 1.  Returns 0 once the whole input is consumed.
 *
 * NOTE(review): inleft and the IN1/OUT1/OUT2/WRITE1/NEXT/REQUIRE_OUTBUF/
 * UCS4INVALID/TRYMAP_ENC helpers come from cjkcodecs.h, which is not
 * visible here; they presumably perform the buffer-space checks and
 * pointer advancement, and may return early on their own.
 */
20ENCODER(cp932)
21{
22    while (inleft > 0) {
23        Py_UNICODE c = IN1;
24        DBCHAR code;
25        unsigned char c1, c2;
26
27        if (c <= 0x80) {
28            WRITE1((unsigned char)c)
29            NEXT(1, 1)
30            continue;
31        }
32        else if (c >= 0xff61 && c <= 0xff9f) {
            /* half-width katakana: U+FF61..U+FF9F -> 0xa1..0xdf */
33            WRITE1(c - 0xfec0)
34            NEXT(1, 1)
35            continue;
36        }
37        else if (c >= 0xf8f0 && c <= 0xf8f3) {
38            /* Windows compatibility */
39            REQUIRE_OUTBUF(1)
40            if (c == 0xf8f0)
41                OUT1(0xa0)
42            else
                /* U+F8F1..U+F8F3 -> 0xfd..0xff.  The base is 0xf8f1 so
                 * that this is the exact inverse of the decoder's
                 * "0xf8f1 - 0xfd + c"; the previous constant 0xfef1 only
                 * produced the right byte because it differs from 0xf8f1
                 * by a multiple of 0x100 and the value is stored as a
                 * byte. */
43                OUT1(c - 0xf8f1 + 0xfd)
44            NEXT(1, 1)
45            continue;
46        }
47
48        UCS4INVALID(c)
49        REQUIRE_OUTBUF(2)
50
        /* Everything below emits exactly two bytes (see NEXT(1, 2)). */
51        TRYMAP_ENC(cp932ext, code, c) {
52            OUT1(code >> 8)
53            OUT2(code & 0xff)
54        }
55        else TRYMAP_ENC(jisxcommon, code, c) {
56            if (code & 0x8000) /* MSB set: JIS X 0212 */
57                return 1;
58
59            /* JIS X 0208 */
60            c1 = code >> 8;
61            c2 = code & 0xff;
            /* Two JIS rows share one lead byte: odd rows (counted from
             * 0x21) take the upper half of the trail-byte range. */
62            c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
63            c1 = (c1 - 0x21) >> 1;
            /* lead bytes run 0x81..0x9f, then continue at 0xe0;
             * trail bytes run 0x40..0x7e, then continue at 0x80 */
64            OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
65            OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
66        }
67        else if (c >= 0xe000 && c < 0xe758) {
68            /* User-defined area */
            /* 188 code points per lead byte, starting at lead byte 0xf0 */
69            c1 = (Py_UNICODE)(c - 0xe000) / 188;
70            c2 = (Py_UNICODE)(c - 0xe000) % 188;
71            OUT1(c1 + 0xf0)
72            OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
73        }
74        else
75            return 1;
76
77        NEXT(1, 2)
78    }
79
80    return 0;
81}
82
/*
 * CP932 decoder: bytes -> Unicode.
 *
 * Single-byte cases, checked first:
 *   - 0x00..0x80  -> the same code point
 *   - 0xa0        -> U+F8F0
 *   - 0xa1..0xdf  -> 0xfec0 + c (U+FF61..U+FF9F)
 *   - 0xfd..0xff  -> U+F8F1..U+F8F3
 * Otherwise a second byte is required and the pair is tried against the
 * cp932ext table, then (lead 0x81..0x9f or 0xe0..0xea) unfolded to a
 * JIS X 0208 row/cell and looked up in jisx0208, then (lead 0xf0..0xf9)
 * mapped arithmetically into the user-defined area starting at U+E000.
 * Every failure path returns 2; returns 0 when the input is consumed.
 *
 * NOTE(review): IN1/IN2/OUT1/NEXT/REQUIRE_INBUF/REQUIRE_OUTBUF/TRYMAP_DEC
 * are macros from cjkcodecs.h (not visible here) and may return early.
 */
83DECODER(cp932)
84{
85    while (inleft > 0) {
86        unsigned char c = IN1, c2;
87
88        REQUIRE_OUTBUF(1)
89        if (c <= 0x80) {
90            OUT1(c)
91            NEXT(1, 1)
92            continue;
93        }
94        else if (c >= 0xa0 && c <= 0xdf) {
95            if (c == 0xa0)
96                OUT1(0xf8f0) /* Windows compatibility: mirrors the encoder's U+F8F0 -> 0xa0 */
97            else
98                OUT1(0xfec0 + c) /* half-width katakana */
99            NEXT(1, 1)
100            continue;
101        }
102        else if (c >= 0xfd/* && c <= 0xff*/) {
103            /* Windows compatibility */
104            OUT1(0xf8f1 - 0xfd + c)
105            NEXT(1, 1)
106            continue;
107        }
108
109        REQUIRE_INBUF(2)
110        c2 = IN2;
111
112        TRYMAP_DEC(cp932ext, **outbuf, c, c2);
113        else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
            /* valid trail bytes are 0x40..0x7e and 0x80..0xfc */
114            if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
115                return 2;
116
            /* Undo the Shift_JIS folding: close the gaps in the lead and
             * trail byte ranges, then split each lead byte back into two
             * JIS X 0208 rows of 0x5e cells, both based at 0x21. */
117            c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
118            c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
119            c = (2 * c + (c2 < 0x5e ? 0 : 1) + 0x21);
120            c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
121
122            TRYMAP_DEC(jisx0208, **outbuf, c, c2);
123            else return 2;
124        }
125        else if (c >= 0xf0 && c <= 0xf9) {
            /* User-defined area: 188 code points per lead byte */
126            if ((c2 >= 0x40 && c2 <= 0x7e) ||
127                (c2 >= 0x80 && c2 <= 0xfc))
128                OUT1(0xe000 + 188 * (c - 0xf0) +
129                     (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
130            else
131                return 2;
132        }
133        else
134            return 2;
135
136        NEXT(2, 1)
137    }
138
139    return 0;
140}
141
142
143/*
144 * EUC-JIS-2004 codec
145 */
146
/*
 * EUC-JIS-2004 encoder: Unicode -> bytes.
 *
 * Code points below 0x80 are written as one byte.  Everything else is
 * resolved to a 16-bit `code` and emitted at the bottom of the loop:
 *   - code with 0x8000 set -> three bytes: 0x8f, high byte, low byte|0x80
 *   - otherwise            -> two bytes, each with 0x80 set
 * Half-width katakana (U+FF61..U+FF9F) bypass `code` and are written as
 * 0x8e followed by c - 0xfec0.
 *
 * `insize` is the number of input units consumed for this character; it
 * starts as GET_INSIZE(c) and becomes 2 when a two-code-point sequence
 * from the jisx0213 pair table is matched.
 *
 * Returns 0 when the input is consumed, 1 or insize for an unencodable
 * character, or MBERR_TOOFEW when a possible pair needs one more input
 * unit and MBENC_FLUSH is not set in flags.
 *
 * NOTE(review): DECODE_SURROGATE, GET_INSIZE, TRYMAP_ENC, WRITE*/NEXT and
 * the EMULATE_JISX0213_2000_* macros are defined in headers not visible
 * here; the emulation macros presumably apply only to the "*x0213"
 * (year-2000) codec variants registered at the bottom of this file.
 */
147ENCODER(euc_jis_2004)
148{
149    while (inleft > 0) {
150        ucs4_t c = IN1;
151        DBCHAR code;
152        Py_ssize_t insize;
153
154        if (c < 0x80) {
155            WRITE1(c)
156            NEXT(1, 1)
157            continue;
158        }
159
160        DECODE_SURROGATE(c)
161        insize = GET_INSIZE(c);
162
163        if (c <= 0xFFFF) {
164            EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
165            else TRYMAP_ENC(jisx0213_bmp, code, c) {
                /* MULTIC: c may combine with the following code unit
                 * into a single code via the pair table. */
166                if (code == MULTIC) {
167                    if (inleft < 2) {
                        /* No following unit: encode c alone only when
                         * flushing, otherwise report too few inputs. */
168                        if (flags & MBENC_FLUSH) {
169                            code = find_pairencmap(
170                                (ucs2_t)c, 0,
171                              jisx0213_pair_encmap,
172                                JISX0213_ENCPAIRS);
173                            if (code == DBCINV)
174                                return 1;
175                        }
176                        else
177                            return MBERR_TOOFEW;
178                    }
179                    else {
                        /* Try (c, next unit) first; if that pair is not
                         * in the table, fall back to c on its own. */
180                        code = find_pairencmap(
181                            (ucs2_t)c, (*inbuf)[1],
182                            jisx0213_pair_encmap,
183                            JISX0213_ENCPAIRS);
184                        if (code == DBCINV) {
185                            code = find_pairencmap(
186                                (ucs2_t)c, 0,
187                              jisx0213_pair_encmap,
188                                JISX0213_ENCPAIRS);
189                            if (code == DBCINV)
190                                return 1;
191                        } else
192                            insize = 2;
193                    }
194                }
195            }
196            else TRYMAP_ENC(jisxcommon, code, c);
197            else if (c >= 0xff61 && c <= 0xff9f) {
198                /* JIS X 0201 half-width katakana */
199                WRITE2(0x8e, c - 0xfec0)
200                NEXT(1, 2)
201                continue;
202            }
203            else if (c == 0xff3c)
204                /* F/W REVERSE SOLIDUS (see NOTES) */
205                code = 0x2140;
206            else if (c == 0xff5e)
207                /* F/W TILDE (see NOTES) */
208                code = 0x2232;
209            else
210                return 1;
211        }
212        else if (c >> 16 == EMPBASE >> 16) {
            /* Same 64K plane as EMPBASE (0x20000): look up the low 16 bits */
213            EMULATE_JISX0213_2000_ENCODE_EMP(code, c)
214            else TRYMAP_ENC(jisx0213_emp, code, c & 0xffff);
215            else return insize;
216        }
217        else
218            return insize;
219
220        if (code & 0x8000) {
221            /* Codeset 2 */
222            WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
223            NEXT(insize, 3)
224        } else {
225            /* Codeset 1 */
226            WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
227            NEXT(insize, 2)
228        }
229    }
230
231    return 0;
232}
233
/*
 * EUC-JIS-2004 decoder: bytes -> Unicode.
 *
 * Dispatches on the first byte:
 *   - < 0x80  -> the same code point
 *   - 0x8e    -> two-byte half-width katakana; second byte must be
 *                0xa1..0xdf and maps to 0xfec0 + c2
 *   - 0x8f    -> three-byte sequence; the last two bytes (high bit
 *                flipped) are tried against JIS X 0213 plane 2 (BMP, then
 *                supplementary) and finally JIS X 0212
 *   - other   -> two-byte sequence (high bits flipped) tried against
 *                JIS X 0208 and JIS X 0213 plane 1 (BMP, supplementary,
 *                then the two-code-point pair table)
 * Returns 0 when the input is consumed, or the length of the offending
 * sequence (2 or 3) when it cannot be decoded.
 *
 * NOTE(review): the helper macros come from headers not visible here and
 * may return early (e.g. on insufficient input or output space).
 */
234DECODER(euc_jis_2004)
235{
236    while (inleft > 0) {
237        unsigned char c = IN1;
238        ucs4_t code;
239
240        REQUIRE_OUTBUF(1)
241
242        if (c < 0x80) {
243            OUT1(c)
244            NEXT(1, 1)
245            continue;
246        }
247
248        if (c == 0x8e) {
249            /* JIS X 0201 half-width katakana */
250            unsigned char c2;
251
252            REQUIRE_INBUF(2)
253            c2 = IN2;
254            if (c2 >= 0xa1 && c2 <= 0xdf) {
255                OUT1(0xfec0 + c2)
256                NEXT(2, 1)
257            }
258            else
259                return 2;
260        }
261        else if (c == 0x8f) {
262            unsigned char c2, c3;
263
264            REQUIRE_INBUF(3)
265            c2 = IN2 ^ 0x80;
266            c3 = IN3 ^ 0x80;
267
268            /* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */
269            EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, c2, c3)
270            else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, c2, c3) ;
271            else TRYMAP_DEC(jisx0213_2_emp, code, c2, c3) {
                /* supplementary-plane result: written via WRITEUCS4, so
                 * only the input is advanced here */
272                WRITEUCS4(EMPBASE | code)
273                NEXT_IN(3)
274                continue;
275            }
276            else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
277            else return 3;
278            NEXT(3, 1)
279        }
280        else {
281            unsigned char c2;
282
283            REQUIRE_INBUF(2)
284            c ^= 0x80;
285            c2 = IN2 ^ 0x80;
286
287            /* JIS X 0213 Plane 1 */
288            EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, c, c2)
            /* the two special cases below mirror the encoder's
             * U+FF3C -> 0x2140 and U+FF5E -> 0x2232 */
289            else if (c == 0x21 && c2 == 0x40) **outbuf = 0xff3c;
290            else if (c == 0x22 && c2 == 0x32) **outbuf = 0xff5e;
291            else TRYMAP_DEC(jisx0208, **outbuf, c, c2);
292            else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, c, c2);
293            else TRYMAP_DEC(jisx0213_1_emp, code, c, c2) {
294                WRITEUCS4(EMPBASE | code)
295                NEXT_IN(2)
296                continue;
297            }
298            else TRYMAP_DEC(jisx0213_pair, code, c, c2) {
                /* pair entry packs two code points: high 16 bits, low 16 bits */
299                WRITE2(code >> 16, code & 0xffff)
300                NEXT(2, 2)
301                continue;
302            }
303            else return 2;
304            NEXT(2, 1)
305        }
306    }
307
308    return 0;
309}
310
311
312/*
313 * EUC-JP codec
314 */
315
/*
 * EUC-JP encoder: Unicode -> bytes.
 *
 * Code points below 0x80 are written as one byte.  Otherwise the
 * character is looked up in jisxcommon and emitted as
 *   - three bytes (0x8f, high byte, low byte|0x80) when the code has
 *     0x8000 set (JIS X 0212), or
 *   - two bytes, each with 0x80 set (JIS X 0208).
 * Half-width katakana (U+FF61..U+FF9F) are written as 0x8e followed by
 * c - 0xfec0.  Unless STRICT_BUILD is defined, U+FF3C is additionally
 * mapped to 0x2140, U+00A5 to byte 0x5c and U+203E to byte 0x7e.
 * Returns 1 for an unencodable character, 0 when the input is consumed.
 *
 * NOTE(review): UCS4INVALID, TRYMAP_ENC, WRITE* and NEXT are macros from
 * cjkcodecs.h (not visible here) and may return early.
 */
316ENCODER(euc_jp)
317{
318    while (inleft > 0) {
319        Py_UNICODE c = IN1;
320        DBCHAR code;
321
322        if (c < 0x80) {
323            WRITE1((unsigned char)c)
324            NEXT(1, 1)
325            continue;
326        }
327
328        UCS4INVALID(c)
329
330        TRYMAP_ENC(jisxcommon, code, c);
331        else if (c >= 0xff61 && c <= 0xff9f) {
332            /* JIS X 0201 half-width katakana */
333            WRITE2(0x8e, c - 0xfec0)
334            NEXT(1, 2)
335            continue;
336        }
337#ifndef STRICT_BUILD
338        else if (c == 0xff3c) /* FULL-WIDTH REVERSE SOLIDUS */
339            code = 0x2140;
340        else if (c == 0xa5) { /* YEN SIGN */
341            WRITE1(0x5c);
342            NEXT(1, 1)
343            continue;
344        } else if (c == 0x203e) { /* OVERLINE */
345            WRITE1(0x7e);
346            NEXT(1, 1)
347            continue;
348        }
349#endif
350        else
351            return 1;
352
353        if (code & 0x8000) {
354            /* JIS X 0212 */
355            WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
356            NEXT(1, 3)
357        } else {
358            /* JIS X 0208 */
359            WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
360            NEXT(1, 2)
361        }
362    }
363
364    return 0;
365}
366
/*
 * EUC-JP decoder: bytes -> Unicode.
 *
 * Dispatches on the first byte:
 *   - < 0x80  -> the same code point
 *   - 0x8e    -> two-byte half-width katakana; second byte must be
 *                0xa1..0xdf and maps to 0xfec0 + c2
 *   - 0x8f    -> three-byte JIS X 0212 sequence
 *   - other   -> two-byte JIS X 0208 sequence
 * Table lookups use the bytes with their high bit flipped (^ 0x80).
 * Unless STRICT_BUILD is defined, the byte pair 0xa1 0xc0 decodes to
 * U+FF3C without consulting the table.
 * Returns 0 when the input is consumed, or the length of the offending
 * sequence (2 or 3) when it cannot be decoded.
 *
 * NOTE(review): the helper macros come from cjkcodecs.h (not visible
 * here) and may return early.
 */
367DECODER(euc_jp)
368{
369    while (inleft > 0) {
370        unsigned char c = IN1;
371
372        REQUIRE_OUTBUF(1)
373
374        if (c < 0x80) {
375            OUT1(c)
376            NEXT(1, 1)
377            continue;
378        }
379
380        if (c == 0x8e) {
381            /* JIS X 0201 half-width katakana */
382            unsigned char c2;
383
384            REQUIRE_INBUF(2)
385            c2 = IN2;
386            if (c2 >= 0xa1 && c2 <= 0xdf) {
387                OUT1(0xfec0 + c2)
388                NEXT(2, 1)
389            }
390            else
391                return 2;
392        }
393        else if (c == 0x8f) {
394            unsigned char c2, c3;
395
396            REQUIRE_INBUF(3)
397            c2 = IN2;
398            c3 = IN3;
399            /* JIS X 0212 */
400            TRYMAP_DEC(jisx0212, **outbuf, c2 ^ 0x80, c3 ^ 0x80) {
401                NEXT(3, 1)
402            }
403            else
404                return 3;
405        }
406        else {
407            unsigned char c2;
408
409            REQUIRE_INBUF(2)
410            c2 = IN2;
411            /* JIS X 0208 */
412#ifndef STRICT_BUILD
413            if (c == 0xa1 && c2 == 0xc0)
414                /* FULL-WIDTH REVERSE SOLIDUS */
415                **outbuf = 0xff3c;
416            else
417#endif
418                TRYMAP_DEC(jisx0208, **outbuf,
419                           c ^ 0x80, c2 ^ 0x80) ;
420            else return 2;
421            NEXT(2, 1)
422        }
423    }
424
425    return 0;
426}
427
428
429/*
430 * SHIFT_JIS codec
431 */
432
/*
 * Shift_JIS encoder: Unicode -> bytes.
 *
 * First tries to resolve the character to a single-byte code: in a
 * non-strict build, code points below 0x80 map to themselves, U+00A5 to
 * 0x5c and U+203E to 0x7e; a strict build uses JISX0201_R_ENCODE
 * instead.  JISX0201_K_ENCODE is tried next.  A resulting code below
 * 0x80 or in 0xa1..0xdf is written as one byte.
 *
 * Otherwise code is NOCHAR and the character is looked up in jisxcommon
 * (plus U+FF3C -> 0x2140 in a non-strict build).  Codes with the MSB set
 * (JIS X 0212) are rejected.  The JIS X 0208 row/cell is then folded
 * into a two-byte Shift_JIS sequence.
 * Returns 1 for an unencodable character, 0 when the input is consumed.
 *
 * NOTE(review): the JISX0201_* macros (alg_jisx0201.h) and UCS4INVALID,
 * TRYMAP_ENC, OUT*, NEXT, REQUIRE_OUTBUF (cjkcodecs.h) are not visible
 * here; they evidently expand to if-fragments that chain with the
 * surrounding `else` keywords, so statement order must not be changed.
 */
433ENCODER(shift_jis)
434{
435    while (inleft > 0) {
436        Py_UNICODE c = IN1;
437        DBCHAR code;
438        unsigned char c1, c2;
439
440#ifdef STRICT_BUILD
441        JISX0201_R_ENCODE(c, code)
442#else
443        if (c < 0x80) code = c;
444        else if (c == 0x00a5) code = 0x5c; /* YEN SIGN */
445        else if (c == 0x203e) code = 0x7e; /* OVERLINE */
446#endif
447        else JISX0201_K_ENCODE(c, code)
448        else UCS4INVALID(c)
449        else code = NOCHAR;
450
451        if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
452            REQUIRE_OUTBUF(1)
453
454            OUT1((unsigned char)code)
455            NEXT(1, 1)
456            continue;
457        }
458
459        REQUIRE_OUTBUF(2)
460
461        if (code == NOCHAR) {
462            TRYMAP_ENC(jisxcommon, code, c);
463#ifndef STRICT_BUILD
464            else if (c == 0xff3c)
465                code = 0x2140; /* FULL-WIDTH REVERSE SOLIDUS */
466#endif
467            else
468                return 1;
469
470            if (code & 0x8000) /* MSB set: JIS X 0212 */
471                return 1;
472        }
473
        /* Fold JIS X 0208 row (c1) / cell (c2) into Shift_JIS: two rows
         * share one lead byte, odd rows use the upper half of the trail
         * range.  Lead bytes run 0x81..0x9f then continue at 0xe0; trail
         * bytes run 0x40..0x7e then continue at 0x80. */
474        c1 = code >> 8;
475        c2 = code & 0xff;
476        c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
477        c1 = (c1 - 0x21) >> 1;
478        OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
479        OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
480        NEXT(1, 2)
481    }
482
483    return 0;
484}
485
/*
 * Shift_JIS decoder: bytes -> Unicode.
 *
 * Single-byte input is handled by the first links of the chain (bytes
 * below 0x80 map to themselves in a non-strict build; JISX0201_R_DECODE
 * and JISX0201_K_DECODE cover the rest) and is consumed by the
 * NEXT(1, 1) at the bottom of the loop.
 *
 * Lead bytes 0x81..0x9f and 0xe0..0xea start a two-byte sequence whose
 * trail byte must be 0x40..0x7e or 0x80..0xfc.  The pair is unfolded to
 * a JIS X 0208 row/cell (both based at 0x21) and looked up in jisx0208;
 * in a non-strict build row 0x21 / cell 0x40 decodes to U+FF3C without
 * consulting the table.  Both two-byte success paths advance and
 * `continue` themselves.
 * Returns 2 on any undecodable input, 0 when the input is consumed.
 *
 * NOTE(review): the JISX0201_* macros and the cjkcodecs.h helpers are
 * not visible here; they expand to if-fragments chained with the
 * surrounding `else` keywords and may return early.
 */
486DECODER(shift_jis)
487{
488    while (inleft > 0) {
489        unsigned char c = IN1;
490
491        REQUIRE_OUTBUF(1)
492
493#ifdef STRICT_BUILD
494        JISX0201_R_DECODE(c, **outbuf)
495#else
496        if (c < 0x80) **outbuf = c;
497#endif
498        else JISX0201_K_DECODE(c, **outbuf)
499        else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
500            unsigned char c1, c2;
501
502            REQUIRE_INBUF(2)
503            c2 = IN2;
504            if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
505                return 2;
506
            /* close the gaps in the lead/trail ranges, then split each
             * lead byte back into two rows of 0x5e cells */
507            c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
508            c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
509            c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1) + 0x21);
510            c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
511
512#ifndef STRICT_BUILD
513            if (c1 == 0x21 && c2 == 0x40) {
514                /* FULL-WIDTH REVERSE SOLIDUS */
515                OUT1(0xff3c)
516                NEXT(2, 1)
517                continue;
518            }
519#endif
520            TRYMAP_DEC(jisx0208, **outbuf, c1, c2) {
521                NEXT(2, 1)
522                continue;
523            }
524            else
525                return 2;
526        }
527        else
528            return 2;
529
530        NEXT(1, 1) /* JIS X 0201 */
531    }
532
533    return 0;
534}
535
536
537/*
538 * SHIFT_JIS-2004 codec
539 */
540
/*
 * Shift_JIS-2004 encoder: Unicode -> bytes.
 *
 * JISX0201_ENCODE is tried first; a resulting code below 0x80 or in
 * 0xa1..0xdf is written as one byte.  Otherwise code stays NOCHAR and
 * the character is resolved like this:
 *   - c <= 0xffff: year-2000 emulation, then jisx0213_bmp (including
 *     two-code-point sequences via the pair table, see MULTIC below),
 *     then jisxcommon with JIS X 0212 hits (MSB set) rejected
 *   - c in the same 64K plane as EMPBASE (0x20000): jisx0213_emp lookup
 *     on the low 16 bits
 * The resulting row byte (c1) and cell index (c2) are then folded into a
 * two-byte Shift_JIS sequence; codes with the high bit set in the row
 * byte are plane 2 and have their row remapped first.
 *
 * `insize` is the number of input units consumed for this character; it
 * becomes 2 when a pair-table sequence is matched.
 *
 * Returns 0 when the input is consumed, 1 or insize for an unencodable
 * character, or MBERR_TOOFEW when a possible pair needs one more input
 * unit and MBENC_FLUSH is not set in flags.
 *
 * NOTE(review): JISX0201_ENCODE, DECODE_SURROGATE, GET_INSIZE,
 * TRYMAP_ENC, the EMULATE_JISX0213_2000_* macros and the buffer helpers
 * are defined in headers not visible here and chain with the
 * surrounding `else` keywords.
 */
541ENCODER(shift_jis_2004)
542{
543    while (inleft > 0) {
544        ucs4_t c = IN1;
545        DBCHAR code = NOCHAR;
546        int c1, c2;
547        Py_ssize_t insize;
548
549        JISX0201_ENCODE(c, code)
550        else DECODE_SURROGATE(c)
551
552        if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
553            WRITE1((unsigned char)code)
554            NEXT(1, 1)
555            continue;
556        }
557
558        REQUIRE_OUTBUF(2)
559        insize = GET_INSIZE(c);
560
561        if (code == NOCHAR) {
562            if (c <= 0xffff) {
563                EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
564                else TRYMAP_ENC(jisx0213_bmp, code, c) {
                    /* MULTIC: c may combine with the following code unit
                     * into a single code via the pair table. */
565                    if (code == MULTIC) {
566                        if (inleft < 2) {
                            /* No following unit: encode c alone only when
                             * flushing, otherwise report too few inputs. */
567                            if (flags & MBENC_FLUSH) {
568                            code = find_pairencmap
569                                ((ucs2_t)c, 0,
570                              jisx0213_pair_encmap,
571                                JISX0213_ENCPAIRS);
572                            if (code == DBCINV)
573                                return 1;
574                            }
575                            else
576                                return MBERR_TOOFEW;
577                        }
578                        else {
                            /* Try (c, next unit) first; if that pair is
                             * not in the table, fall back to c alone. */
579                            code = find_pairencmap(
580                                (ucs2_t)c, IN2,
581                              jisx0213_pair_encmap,
582                                JISX0213_ENCPAIRS);
583                            if (code == DBCINV) {
584                            code = find_pairencmap(
585                                (ucs2_t)c, 0,
586                              jisx0213_pair_encmap,
587                                JISX0213_ENCPAIRS);
588                            if (code == DBCINV)
589                                return 1;
590                            }
591                            else
592                                insize = 2;
593                        }
594                    }
595                }
596                else TRYMAP_ENC(jisxcommon, code, c) {
597                    /* abandon JIS X 0212 codes */
598                    if (code & 0x8000)
599                        return 1;
600                }
601                else return 1;
602            }
603            else if (c >> 16 == EMPBASE >> 16) {
604                EMULATE_JISX0213_2000_ENCODE_EMP(code, c)
605                else TRYMAP_ENC(jisx0213_emp, code, c&0xffff);
606                else return insize;
607            }
608            else
609                return insize;
610        }
611
        /* c1 = row byte (still carrying the plane bit), c2 = 0-based cell */
612        c1 = code >> 8;
613        c2 = (code & 0xff) - 0x21;
614
615        if (c1 & 0x80) { /* Plane 2 */
            /* plane-2 rows are remapped by range into the row indices
             * that follow plane 1 */
616            if (c1 >= 0xee) c1 -= 0x87;
617            else if (c1 >= 0xac || c1 == 0xa8) c1 -= 0x49;
618            else c1 -= 0x43;
619        }
620        else /* Plane 1 */
621            c1 -= 0x21;
622
        /* Two rows share one lead byte; odd rows use the upper half of
         * the trail range.  Lead bytes skip from 0x9f to 0xe0, trail
         * bytes skip 0x7f. */
623        if (c1 & 1) c2 += 0x5e;
624        c1 >>= 1;
625        OUT1(c1 + (c1 < 0x1f ? 0x81 : 0xc1))
626        OUT2(c2 + (c2 < 0x3f ? 0x40 : 0x41))
627
628        NEXT(insize, 2)
629    }
630
631    return 0;
632}
633
/*
 * Shift_JIS-2004 decoder: bytes -> Unicode.
 *
 * JISX0201_DECODE handles single-byte input, which is consumed by the
 * NEXT(1, 1) at the bottom of the loop.  Lead bytes 0x81..0x9f and
 * 0xe0..0xfc start a two-byte sequence whose trail byte must be
 * 0x40..0x7e or 0x80..0xfc.  The pair is unfolded to a 0-based row index
 * (c1) and a cell byte based at 0x21 (c2):
 *   - row index < 0x5e: plane 1; tried against jisx0208, jisx0213_1_bmp,
 *     jisx0213_1_emp and the two-code-point pair table
 *   - otherwise: plane 2; the row index is remapped by range and tried
 *     against jisx0213_2_bmp and jisx0213_2_emp
 * Returns 2 on any undecodable input, 0 when the input is consumed.
 *
 * NOTE(review): JISX0201_DECODE, the EMULATE_JISX0213_2000_* macros and
 * the cjkcodecs.h helpers are not visible here; they chain with the
 * surrounding `else` keywords and may return early.
 */
634DECODER(shift_jis_2004)
635{
636    while (inleft > 0) {
637        unsigned char c = IN1;
638
639        REQUIRE_OUTBUF(1)
640        JISX0201_DECODE(c, **outbuf)
641        else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){
642            unsigned char c1, c2;
643            ucs4_t code;
644
645            REQUIRE_INBUF(2)
646            c2 = IN2;
647            if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
648                return 2;
649
            /* close the gaps in the lead/trail ranges, then split each
             * lead byte into two rows of 0x5e cells; c1 stays 0-based
             * here so it can be used to pick the plane */
650            c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
651            c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
652            c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1));
653            c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
654
655            if (c1 < 0x5e) { /* Plane 1 */
656                c1 += 0x21;
                /* Each branch below advances the output itself; the input
                 * is advanced once by the shared NEXT_IN(2).
                 * NOTE(review): if the emulation macro can store into
                 * **outbuf (its first argument suggests so), that path
                 * has no visible output advance; confirm against
                 * emu_jisx0213_2000.h. */
657                EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf,
658                                c1, c2)
659                else TRYMAP_DEC(jisx0208, **outbuf, c1, c2) {
660                    NEXT_OUT(1)
661                }
662                else TRYMAP_DEC(jisx0213_1_bmp, **outbuf,
663                                c1, c2) {
664                    NEXT_OUT(1)
665                }
666                else TRYMAP_DEC(jisx0213_1_emp, code, c1, c2) {
667                    WRITEUCS4(EMPBASE | code)
668                }
669                else TRYMAP_DEC(jisx0213_pair, code, c1, c2) {
                    /* pair entry packs two code points: high 16 bits, low 16 bits */
670                    WRITE2(code >> 16, code & 0xffff)
671                    NEXT_OUT(2)
672                }
673                else
674                    return 2;
675                NEXT_IN(2)
676            }
677            else { /* Plane 2 */
                /* inverse of the encoder's plane-2 row remapping */
678                if (c1 >= 0x67) c1 += 0x07;
679                else if (c1 >= 0x63 || c1 == 0x5f) c1 -= 0x37;
680                else c1 -= 0x3d;
681
682                EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf,
683                                c1, c2)
684                else TRYMAP_DEC(jisx0213_2_bmp, **outbuf,
685                                c1, c2) ;
686                else TRYMAP_DEC(jisx0213_2_emp, code, c1, c2) {
687                    WRITEUCS4(EMPBASE | code)
688                    NEXT_IN(2)
689                    continue;
690                }
691                else
692                    return 2;
693                NEXT(2, 1)
694            }
695            continue;
696        }
697        else
698            return 2;
699
700        NEXT(1, 1) /* JIS X 0201 */
701    }
702
703    return 0;
704}
705
706
/*
 * Mapping tables registered by this module.  Each name matches a table
 * used by the TRYMAP_ENC / TRYMAP_DEC lookups (or find_pairencmap for
 * jisx0213_pair) in the codecs above.
 * NOTE(review): the DECONLY / ENCONLY / ENCDEC suffixes presumably mark a
 * table as decode-only, encode-only or both; the macros are defined in
 * cjkcodecs.h, which is not visible here.
 */
707BEGIN_MAPPINGS_LIST
708  MAPPING_DECONLY(jisx0208)
709  MAPPING_DECONLY(jisx0212)
710  MAPPING_ENCONLY(jisxcommon)
711  MAPPING_DECONLY(jisx0213_1_bmp)
712  MAPPING_DECONLY(jisx0213_2_bmp)
713  MAPPING_ENCONLY(jisx0213_bmp)
714  MAPPING_DECONLY(jisx0213_1_emp)
715  MAPPING_DECONLY(jisx0213_2_emp)
716  MAPPING_ENCONLY(jisx0213_emp)
717  MAPPING_ENCDEC(jisx0213_pair)
718  MAPPING_ENCDEC(cp932ext)
719END_MAPPINGS_LIST
720
/*
 * Codecs registered by this module: the five encoder/decoder pairs
 * defined above, plus two extra entries ("euc_jisx0213" and
 * "shift_jisx0213") that reuse the *_2004 methods with (void *)2000 as
 * their second field.
 * NOTE(review): that value is presumably the `config` consulted by the
 * EMULATE_JISX0213_2000_* macros to reproduce the year-2000 edition of
 * JIS X 0213; confirm against emu_jisx0213_2000.h and cjkcodecs.h.
 */
721BEGIN_CODECS_LIST
722  CODEC_STATELESS(shift_jis)
723  CODEC_STATELESS(cp932)
724  CODEC_STATELESS(euc_jp)
725  CODEC_STATELESS(shift_jis_2004)
726  CODEC_STATELESS(euc_jis_2004)
727  { "euc_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(euc_jis_2004) },
728  { "shift_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(shift_jis_2004) },
729END_CODECS_LIST
730
/*
 * Module boilerplate for the "jp" codec collection.
 * NOTE(review): presumably expands to the extension-module init function
 * that exposes the mapping and codec lists above; the macro is defined
 * in cjkcodecs.h, which is not visible here.
 */
731I_AM_A_MODULE_FOR(jp)
732