1/*
2 * _codecs_kr.c: Codecs collection for Korean encodings
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7#include "cjkcodecs.h"
8#include "mappings_kr.h"
9
10/*
11 * EUC-KR codec
12 */
13
/*
 * Bytes used by the 8-byte EUC-KR make-up sequence.  Every byte pair in
 * the sequence starts with EUCKR_JAMO_FIRSTBYTE; the opening pair ends
 * with EUCKR_JAMO_FILLER, which also stands for "no final consonant" in
 * the last pair.
 */
#define EUCKR_JAMO_FIRSTBYTE    0xA4
#define EUCKR_JAMO_FILLER       0xD4

/*
 * Second byte of each jamo pair, indexed by the position of the initial,
 * medial and final jamo within a Hangul syllable.  With s = code point -
 * 0xac00, the EUC-KR encoder uses s / 588, (s / 28) % 21 and s % 28 as
 * the three indexes.
 */
static const unsigned char u2cgk_choseong[19] = {
    /*  0- 4 */ 0xa1, 0xa2, 0xa4, 0xa7, 0xa8,
    /*  5- 9 */ 0xa9, 0xb1, 0xb2, 0xb3, 0xb5,
    /* 10-14 */ 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
    /* 15-18 */ 0xbb, 0xbc, 0xbd, 0xbe
};
static const unsigned char u2cgk_jungseong[21] = {
    /*  0- 6 */ 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
    /*  7-13 */ 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc,
    /* 14-20 */ 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3
};
static const unsigned char u2cgk_jongseong[28] = {
    /* index 0 means the syllable has no final consonant */
    /*  0- 6 */ EUCKR_JAMO_FILLER, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6,
    /*  7-13 */ 0xa7, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae,
    /* 14-20 */ 0xaf, 0xb0, 0xb1, 0xb2, 0xb4, 0xb5, 0xb6,
    /* 21-27 */ 0xb7, 0xb8, 0xba, 0xbb, 0xbc, 0xbd, 0xbe
};
33
34ENCODER(euc_kr)
35{
36    while (*inpos < inlen) {
37        Py_UCS4 c = INCHAR1;
38        DBCHAR code;
39
40        if (c < 0x80) {
41            WRITEBYTE1((unsigned char)c);
42            NEXT(1, 1);
43            continue;
44        }
45
46        if (c > 0xFFFF)
47            return 1;
48
49        REQUIRE_OUTBUF(2);
50        if (TRYMAP_ENC(cp949, code, c))
51            ;
52        else
53            return 1;
54
55        if ((code & 0x8000) == 0) {
56            /* KS X 1001 coded character */
57            OUTBYTE1((code >> 8) | 0x80);
58            OUTBYTE2((code & 0xFF) | 0x80);
59            NEXT(1, 2);
60        }
61        else {
62            /* Mapping is found in CP949 extension,
63               but we encode it in KS X 1001:1998 Annex 3,
64               make-up sequence for EUC-KR. */
65
66            REQUIRE_OUTBUF(8);
67
68            /* syllable composition precedence */
69            OUTBYTE1(EUCKR_JAMO_FIRSTBYTE);
70            OUTBYTE2(EUCKR_JAMO_FILLER);
71
72            /* All code points in CP949 extension are in unicode
73             * Hangul Syllable area. */
74            assert(0xac00 <= c && c <= 0xd7a3);
75            c -= 0xac00;
76
77            OUTBYTE3(EUCKR_JAMO_FIRSTBYTE);
78            OUTBYTE4(u2cgk_choseong[c / 588]);
79            NEXT_OUT(4);
80
81            OUTBYTE1(EUCKR_JAMO_FIRSTBYTE);
82            OUTBYTE2(u2cgk_jungseong[(c / 28) % 21]);
83            OUTBYTE3(EUCKR_JAMO_FIRSTBYTE);
84            OUTBYTE4(u2cgk_jongseong[c % 28]);
85            NEXT(1, 4);
86        }
87    }
88
89    return 0;
90}
91
/* Marks a byte that names no jamo of the requested kind. */
#define NONE    127

/*
 * Reverse lookups for the make-up sequence: index is the jamo byte minus
 * 0xa1 (valid bytes are 0xa1..0xbe), value is the initial-consonant or
 * final-consonant index used in the syllable formula, or NONE.
 */
static const unsigned char cgk2u_choseong[0xbe - 0xa1 + 1] = {
    /* a1-aa */    0,    1, NONE,    2, NONE, NONE,    3,    4,    5, NONE,
    /* ab-b4 */ NONE, NONE, NONE, NONE, NONE, NONE,    6,    7,    8, NONE,
    /* b5-be */    9,   10,   11,   12,   13,   14,   15,   16,   17,   18
};
static const unsigned char cgk2u_jongseong[0xbe - 0xa1 + 1] = {
    /* a1-aa */    1,    2,    3,    4,    5,    6,    7, NONE,    8,    9,
    /* ab-b4 */   10,   11,   12,   13,   14,   15,   16,   17, NONE,   18,
    /* b5-be */   19,   20,   21,   22, NONE,   23,   24,   25,   26,   27
};
106
107DECODER(euc_kr)
108{
109    while (inleft > 0) {
110        unsigned char c = INBYTE1;
111        Py_UCS4 decoded;
112
113        if (c < 0x80) {
114            OUTCHAR(c);
115            NEXT_IN(1);
116            continue;
117        }
118
119        REQUIRE_INBUF(2);
120
121        if (c == EUCKR_JAMO_FIRSTBYTE &&
122            INBYTE2 == EUCKR_JAMO_FILLER) {
123            /* KS X 1001:1998 Annex 3 make-up sequence */
124            DBCHAR cho, jung, jong;
125
126            REQUIRE_INBUF(8);
127            if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
128                (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
129                (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
130                return 1;
131
132            c = (*inbuf)[3];
133            if (0xa1 <= c && c <= 0xbe)
134                cho = cgk2u_choseong[c - 0xa1];
135            else
136                cho = NONE;
137
138            c = (*inbuf)[5];
139            jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE;
140
141            c = (*inbuf)[7];
142            if (c == EUCKR_JAMO_FILLER)
143                jong = 0;
144            else if (0xa1 <= c && c <= 0xbe)
145                jong = cgk2u_jongseong[c - 0xa1];
146            else
147                jong = NONE;
148
149            if (cho == NONE || jung == NONE || jong == NONE)
150                return 1;
151
152            OUTCHAR(0xac00 + cho*588 + jung*28 + jong);
153            NEXT_IN(8);
154        }
155        else if (TRYMAP_DEC(ksx1001, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) {
156            OUTCHAR(decoded);
157            NEXT_IN(2);
158        }
159        else
160            return 1;
161    }
162
163    return 0;
164}
165#undef NONE
166
167
168/*
169 * CP949 codec
170 */
171
172ENCODER(cp949)
173{
174    while (*inpos < inlen) {
175        Py_UCS4 c = INCHAR1;
176        DBCHAR code;
177
178        if (c < 0x80) {
179            WRITEBYTE1((unsigned char)c);
180            NEXT(1, 1);
181            continue;
182        }
183
184        if (c > 0xFFFF)
185            return 1;
186
187        REQUIRE_OUTBUF(2);
188        if (TRYMAP_ENC(cp949, code, c))
189            ;
190        else
191            return 1;
192
193        OUTBYTE1((code >> 8) | 0x80);
194        if (code & 0x8000)
195            OUTBYTE2(code & 0xFF); /* MSB set: CP949 */
196        else
197            OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: ks x 1001 */
198        NEXT(1, 2);
199    }
200
201    return 0;
202}
203
204DECODER(cp949)
205{
206    while (inleft > 0) {
207        unsigned char c = INBYTE1;
208        Py_UCS4 decoded;
209
210        if (c < 0x80) {
211            OUTCHAR(c);
212            NEXT_IN(1);
213            continue;
214        }
215
216        REQUIRE_INBUF(2);
217        if (TRYMAP_DEC(ksx1001, decoded, c ^ 0x80, INBYTE2 ^ 0x80))
218            OUTCHAR(decoded);
219        else if (TRYMAP_DEC(cp949ext, decoded, c, INBYTE2))
220            OUTCHAR(decoded);
221        else
222            return 1;
223
224        NEXT_IN(2);
225    }
226
227    return 0;
228}
229
230
231/*
232 * JOHAB codec
233 */
234
/*
 * 5-bit JOHAB field values for the initial, medial and final jamo of a
 * Hangul syllable, indexed by the jamo's position in the syllable formula
 * (s / 588, (s / 28) % 21 and s % 28 for s = code point - 0xac00).  The
 * JOHAB encoder shifts them to bits 10-14, 5-9 and 0-4 respectively.
 * The arrays are declared with 32 slots; unused trailing slots are zero.
 */
static const unsigned char u2johabidx_choseong[32] = {
    /*  0- 6 */ 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
    /*  7-13 */ 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 14-18 */ 0x10, 0x11, 0x12, 0x13, 0x14,
};
static const unsigned char u2johabidx_jungseong[32] = {
    /*  0- 4 */ 0x03, 0x04, 0x05, 0x06, 0x07,
    /*  5-10 */ 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 11-16 */ 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    /* 17-20 */ 0x1a, 0x1b, 0x1c, 0x1d,
};
static const unsigned char u2johabidx_jongseong[32] = {
    /*  0- 7 */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
    /*  8-16 */ 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
    /* 0x12 is skipped */
    /* 17-27 */ 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b,
                0x1c, 0x1d,
};
252static const DBCHAR u2johabjamo[] = {
253            0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
254    0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
255    0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
256    0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
257    0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
258    0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
259    0x8741, 0x8761, 0x8781, 0x87a1,
260};
261
262ENCODER(johab)
263{
264    while (*inpos < inlen) {
265        Py_UCS4 c = INCHAR1;
266        DBCHAR code;
267
268        if (c < 0x80) {
269            WRITEBYTE1((unsigned char)c);
270            NEXT(1, 1);
271            continue;
272        }
273
274        if (c > 0xFFFF)
275            return 1;
276
277        REQUIRE_OUTBUF(2);
278
279        if (c >= 0xac00 && c <= 0xd7a3) {
280            c -= 0xac00;
281            code = 0x8000 |
282                (u2johabidx_choseong[c / 588] << 10) |
283                (u2johabidx_jungseong[(c / 28) % 21] << 5) |
284                u2johabidx_jongseong[c % 28];
285        }
286        else if (c >= 0x3131 && c <= 0x3163)
287            code = u2johabjamo[c - 0x3131];
288        else if (TRYMAP_ENC(cp949, code, c)) {
289            unsigned char c1, c2, t2;
290            unsigned short t1;
291
292            assert((code & 0x8000) == 0);
293            c1 = code >> 8;
294            c2 = code & 0xff;
295            if (((c1 >= 0x21 && c1 <= 0x2c) ||
296                (c1 >= 0x4a && c1 <= 0x7d)) &&
297                (c2 >= 0x21 && c2 <= 0x7e)) {
298                t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) :
299                          (c1 - 0x21 + 0x197));
300                t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21);
301                OUTBYTE1(t1 >> 1);
302                OUTBYTE2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43);
303                NEXT(1, 2);
304                continue;
305            }
306            else
307                return 1;
308        }
309        else
310            return 1;
311
312        OUTBYTE1(code >> 8);
313        OUTBYTE2(code & 0xff);
314        NEXT(1, 2);
315    }
316
317    return 0;
318}
319
/* Table markers: FILL = the "no jamo in this position" field value,
   NONE = a field value that is not valid at all. */
#define FILL 0xfd
#define NONE 0xff

/*
 * JOHAB decode tables.  Each is indexed by one of the 5-bit fields the
 * decoder extracts from a two-byte code.  The johabidx_* tables yield the
 * jamo's position for the syllable formula; the johabjamo_* tables yield
 * the low byte that the decoder ORs with 0x3100 when a code carries a
 * single jamo.
 */
static const unsigned char johabidx_choseong[32] = {
    /* 0x00 */ NONE, FILL,    0,    1,    2,    3,    4,    5,
    /* 0x08 */    6,    7,    8,    9,   10,   11,   12,   13,
    /* 0x10 */   14,   15,   16,   17,   18, NONE, NONE, NONE,
    /* 0x18 */ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};
static const unsigned char johabidx_jungseong[32] = {
    /* 0x00 */ NONE, NONE, FILL,    0,    1,    2,    3,    4,
    /* 0x08 */ NONE, NONE,    5,    6,    7,    8,    9,   10,
    /* 0x10 */ NONE, NONE,   11,   12,   13,   14,   15,   16,
    /* 0x18 */ NONE, NONE,   17,   18,   19,   20, NONE, NONE,
};
static const unsigned char johabidx_jongseong[32] = {
    /* 0x00 */ NONE, FILL,    1,    2,    3,    4,    5,    6,
    /* 0x08 */    7,    8,    9,   10,   11,   12,   13,   14,
    /* 0x10 */   15,   16, NONE,   17,   18,   19,   20,   21,
    /* 0x18 */   22,   23,   24,   25,   26,   27, NONE, NONE,
};

static const unsigned char johabjamo_choseong[32] = {
    /* 0x00 */ NONE, FILL, 0x31, 0x32, 0x34, 0x37, 0x38, 0x39,
    /* 0x08 */ 0x41, 0x42, 0x43, 0x45, 0x46, 0x47, 0x48, 0x49,
    /* 0x10 */ 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE, NONE,
    /* 0x18 */ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};
static const unsigned char johabjamo_jungseong[32] = {
    /* 0x00 */ NONE, NONE, FILL, 0x4f, 0x50, 0x51, 0x52, 0x53,
    /* 0x08 */ NONE, NONE, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
    /* 0x10 */ NONE, NONE, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    /* 0x18 */ NONE, NONE, 0x60, 0x61, 0x62, 0x63, NONE, NONE,
};
static const unsigned char johabjamo_jongseong[32] = {
    /* 0x00 */ NONE, FILL, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
    /* 0x08 */ 0x37, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    /* 0x10 */ 0x40, 0x41, NONE, 0x42, 0x44, 0x45, 0x46, 0x47,
    /* 0x18 */ 0x48, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE,
};
360
361DECODER(johab)
362{
363    while (inleft > 0) {
364        unsigned char c = INBYTE1, c2;
365        Py_UCS4 decoded;
366
367        if (c < 0x80) {
368            OUTCHAR(c);
369            NEXT_IN(1);
370            continue;
371        }
372
373        REQUIRE_INBUF(2);
374        c2 = INBYTE2;
375
376        if (c < 0xd8) {
377            /* johab hangul */
378            unsigned char c_cho, c_jung, c_jong;
379            unsigned char i_cho, i_jung, i_jong;
380
381            c_cho = (c >> 2) & 0x1f;
382            c_jung = ((c << 3) | c2 >> 5) & 0x1f;
383            c_jong = c2 & 0x1f;
384
385            i_cho = johabidx_choseong[c_cho];
386            i_jung = johabidx_jungseong[c_jung];
387            i_jong = johabidx_jongseong[c_jong];
388
389            if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
390                return 1;
391
392            /* we don't use U+1100 hangul jamo yet. */
393            if (i_cho == FILL) {
394                if (i_jung == FILL) {
395                    if (i_jong == FILL)
396                        OUTCHAR(0x3000);
397                    else
398                        OUTCHAR(0x3100 |
399                            johabjamo_jongseong[c_jong]);
400                }
401                else {
402                    if (i_jong == FILL)
403                        OUTCHAR(0x3100 |
404                            johabjamo_jungseong[c_jung]);
405                    else
406                        return 1;
407                }
408            } else {
409                if (i_jung == FILL) {
410                    if (i_jong == FILL)
411                        OUTCHAR(0x3100 |
412                            johabjamo_choseong[c_cho]);
413                    else
414                        return 1;
415                }
416                else
417                    OUTCHAR(0xac00 +
418                        i_cho * 588 +
419                        i_jung * 28 +
420                        (i_jong == FILL ? 0 : i_jong));
421            }
422            NEXT_IN(2);
423        } else {
424            /* KS X 1001 except hangul jamos and syllables */
425            if (c == 0xdf || c > 0xf9 ||
426                c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
427                (c2 & 0x7f) == 0x7f ||
428                (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
429                return 1;
430            else {
431                unsigned char t1, t2;
432
433                t1 = (c < 0xe0 ? 2 * (c - 0xd9) :
434                         2 * c - 0x197);
435                t2 = (c2 < 0x91 ? c2 - 0x31 : c2 - 0x43);
436                t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21;
437                t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
438
439                if (TRYMAP_DEC(ksx1001, decoded, t1, t2)) {
440                    OUTCHAR(decoded);
441                    NEXT_IN(2);
442                }
443                else {
444                    return 1;
445                }
446            }
447        }
448    }
449
450    return 0;
451}
452#undef NONE
453#undef FILL
454
455
456BEGIN_MAPPINGS_LIST
457  MAPPING_DECONLY(ksx1001)
458  MAPPING_ENCONLY(cp949)
459  MAPPING_DECONLY(cp949ext)
460END_MAPPINGS_LIST
461
462BEGIN_CODECS_LIST
463  CODEC_STATELESS(euc_kr)
464  CODEC_STATELESS(cp949)
465  CODEC_STATELESS(johab)
466END_CODECS_LIST
467
468I_AM_A_MODULE_FOR(kr)
469