1/* 2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings 3 * 4 * Written by Hye-Shik Chang <perky@FreeBSD.org> 5 */ 6 7#include "cjkcodecs.h" 8#include "mappings_cn.h" 9 10/** 11 * hz is predefined as 100 on AIX. So we undefine it to avoid 12 * conflict against hz codec's. 13 */ 14#ifdef _AIX 15#undef hz 16#endif 17 18/* GBK and GB2312 map differently in few code points that are listed below: 19 * 20 * gb2312 gbk 21 * A1A4 U+30FB KATAKANA MIDDLE DOT U+00B7 MIDDLE DOT 22 * A1AA U+2015 HORIZONTAL BAR U+2014 EM DASH 23 * A844 undefined U+2015 HORIZONTAL BAR 24 */ 25 26#define GBK_DECODE(dc1, dc2, assi) \ 27 if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \ 28 else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \ 29 else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \ 30 else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \ 31 else TRYMAP_DEC(gbkext, assi, dc1, dc2); 32 33#define GBK_ENCODE(code, assi) \ 34 if ((code) == 0x2014) (assi) = 0xa1aa; \ 35 else if ((code) == 0x2015) (assi) = 0xa844; \ 36 else if ((code) == 0x00b7) (assi) = 0xa1a4; \ 37 else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code)); 38 39/* 40 * GB2312 codec 41 */ 42 43ENCODER(gb2312) 44{ 45 while (inleft > 0) { 46 Py_UNICODE c = IN1; 47 DBCHAR code; 48 49 if (c < 0x80) { 50 WRITE1((unsigned char)c) 51 NEXT(1, 1) 52 continue; 53 } 54 UCS4INVALID(c) 55 56 REQUIRE_OUTBUF(2) 57 TRYMAP_ENC(gbcommon, code, c); 58 else return 1; 59 60 if (code & 0x8000) /* MSB set: GBK */ 61 return 1; 62 63 OUT1((code >> 8) | 0x80) 64 OUT2((code & 0xFF) | 0x80) 65 NEXT(1, 2) 66 } 67 68 return 0; 69} 70 71DECODER(gb2312) 72{ 73 while (inleft > 0) { 74 unsigned char c = **inbuf; 75 76 REQUIRE_OUTBUF(1) 77 78 if (c < 0x80) { 79 OUT1(c) 80 NEXT(1, 1) 81 continue; 82 } 83 84 REQUIRE_INBUF(2) 85 TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) { 86 NEXT(2, 1) 87 } 88 else return 2; 89 } 90 91 return 0; 92} 93 94 95/* 96 * GBK codec 97 */ 98 99ENCODER(gbk) 100{ 101 while (inleft > 0) { 102 Py_UNICODE c = IN1; 103 DBCHAR code; 104 105 if (c < 0x80) { 106 WRITE1((unsigned char)c) 107 NEXT(1, 1) 108 continue; 109 } 110 UCS4INVALID(c) 111 112 REQUIRE_OUTBUF(2) 113 114 GBK_ENCODE(c, code) 115 else return 1; 116 117 OUT1((code >> 8) | 0x80) 118 if (code & 0x8000) 119 OUT2((code & 0xFF)) /* MSB set: GBK */ 120 else 121 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */ 122 NEXT(1, 2) 123 } 124 125 return 0; 126} 127 128DECODER(gbk) 129{ 130 while (inleft > 0) { 131 unsigned char c = IN1; 132 133 REQUIRE_OUTBUF(1) 134 135 if (c < 0x80) { 136 OUT1(c) 137 NEXT(1, 1) 138 continue; 139 } 140 141 REQUIRE_INBUF(2) 142 143 GBK_DECODE(c, IN2, **outbuf) 144 else return 2; 145 146 NEXT(2, 1) 147 } 148 149 return 0; 150} 151 152 153/* 154 * GB18030 codec 155 */ 156 157ENCODER(gb18030) 158{ 159 while (inleft > 0) { 160 ucs4_t c = IN1; 161 DBCHAR code; 162 163 if (c < 0x80) { 164 WRITE1(c) 165 NEXT(1, 1) 166 continue; 167 } 168 169 DECODE_SURROGATE(c) 170 if (c > 0x10FFFF) 171#if Py_UNICODE_SIZE == 2 172 return 2; /* surrogates pair */ 173#else 174 return 1; 175#endif 176 else if (c >= 0x10000) { 177 ucs4_t tc = c - 0x10000; 178 179 REQUIRE_OUTBUF(4) 180 181 OUT4((unsigned char)(tc % 10) + 0x30) 182 tc /= 10; 183 OUT3((unsigned char)(tc % 126) + 0x81) 184 tc /= 126; 185 OUT2((unsigned char)(tc % 10) + 0x30) 186 tc /= 10; 187 OUT1((unsigned char)(tc + 0x90)) 188 189#if Py_UNICODE_SIZE == 2 190 NEXT(2, 4) /* surrogates pair */ 191#else 192 NEXT(1, 4) 193#endif 194 continue; 195 } 196 197 REQUIRE_OUTBUF(2) 198 199 GBK_ENCODE(c, code) 200 else TRYMAP_ENC(gb18030ext, code, c); 201 else { 202 const struct _gb18030_to_unibmp_ranges *utrrange; 203 204 REQUIRE_OUTBUF(4) 205 206 for (utrrange = gb18030_to_unibmp_ranges; 207 utrrange->first != 0; 208 utrrange++) 209 if (utrrange->first <= c && 210 c <= utrrange->last) { 211 Py_UNICODE tc; 212 213 tc = c - utrrange->first + 214 utrrange->base; 215 216 OUT4((unsigned char)(tc % 10) + 0x30) 217 tc /= 10; 218 OUT3((unsigned char)(tc % 126) + 0x81) 219 tc /= 126; 220 OUT2((unsigned char)(tc % 10) + 0x30) 221 tc /= 10; 222 OUT1((unsigned char)tc + 0x81) 223 224 NEXT(1, 4) 225 break; 226 } 227 228 if (utrrange->first == 0) 229 return 1; 230 continue; 231 } 232 233 OUT1((code >> 8) | 0x80) 234 if (code & 0x8000) 235 OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */ 236 else 237 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */ 238 239 NEXT(1, 2) 240 } 241 242 return 0; 243} 244 245DECODER(gb18030) 246{ 247 while (inleft > 0) { 248 unsigned char c = IN1, c2; 249 250 REQUIRE_OUTBUF(1) 251 252 if (c < 0x80) { 253 OUT1(c) 254 NEXT(1, 1) 255 continue; 256 } 257 258 REQUIRE_INBUF(2) 259 260 c2 = IN2; 261 if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */ 262 const struct _gb18030_to_unibmp_ranges *utr; 263 unsigned char c3, c4; 264 ucs4_t lseq; 265 266 REQUIRE_INBUF(4) 267 c3 = IN3; 268 c4 = IN4; 269 if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39) 270 return 4; 271 c -= 0x81; c2 -= 0x30; 272 c3 -= 0x81; c4 -= 0x30; 273 274 if (c < 4) { /* U+0080 - U+FFFF */ 275 lseq = ((ucs4_t)c * 10 + c2) * 1260 + 276 (ucs4_t)c3 * 10 + c4; 277 if (lseq < 39420) { 278 for (utr = gb18030_to_unibmp_ranges; 279 lseq >= (utr + 1)->base; 280 utr++) ; 281 OUT1(utr->first - utr->base + lseq) 282 NEXT(4, 1) 283 continue; 284 } 285 } 286 else if (c >= 15) { /* U+10000 - U+10FFFF */ 287 lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2) 288 * 1260 + (ucs4_t)c3 * 10 + c4; 289 if (lseq <= 0x10FFFF) { 290 WRITEUCS4(lseq); 291 NEXT_IN(4) 292 continue; 293 } 294 } 295 return 4; 296 } 297 298 GBK_DECODE(c, c2, **outbuf) 299 else TRYMAP_DEC(gb18030ext, **outbuf, c, c2); 300 else return 2; 301 302 NEXT(2, 1) 303 } 304 305 return 0; 306} 307 308 309/* 310 * HZ codec 311 */ 312 313ENCODER_INIT(hz) 314{ 315 state->i = 0; 316 return 0; 317} 318 319ENCODER_RESET(hz) 320{ 321 if (state->i != 0) { 322 WRITE2('~', '}') 323 state->i = 0; 324 NEXT_OUT(2) 325 } 326 return 0; 327} 328 329ENCODER(hz) 330{ 331 while (inleft > 0) { 332 Py_UNICODE c = IN1; 333 DBCHAR code; 334 335 if (c < 0x80) { 336 if (state->i == 0) { 337 WRITE1((unsigned char)c) 338 NEXT(1, 1) 339 } 340 else { 341 WRITE3('~', '}', (unsigned char)c) 342 NEXT(1, 3) 343 state->i = 0; 344 } 345 continue; 346 } 347 348 UCS4INVALID(c) 349 350 TRYMAP_ENC(gbcommon, code, c); 351 else return 1; 352 353 if (code & 0x8000) /* MSB set: GBK */ 354 return 1; 355 356 if (state->i == 0) { 357 WRITE4('~', '{', code >> 8, code & 0xff) 358 NEXT(1, 4) 359 state->i = 1; 360 } 361 else { 362 WRITE2(code >> 8, code & 0xff) 363 NEXT(1, 2) 364 } 365 } 366 367 return 0; 368} 369 370DECODER_INIT(hz) 371{ 372 state->i = 0; 373 return 0; 374} 375 376DECODER_RESET(hz) 377{ 378 state->i = 0; 379 return 0; 380} 381 382DECODER(hz) 383{ 384 while (inleft > 0) { 385 unsigned char c = IN1; 386 387 if (c == '~') { 388 unsigned char c2 = IN2; 389 390 REQUIRE_INBUF(2) 391 if (c2 == '~') { 392 WRITE1('~') 393 NEXT(2, 1) 394 continue; 395 } 396 else if (c2 == '{' && state->i == 0) 397 state->i = 1; /* set GB */ 398 else if (c2 == '}' && state->i == 1) 399 state->i = 0; /* set ASCII */ 400 else if (c2 == '\n') 401 ; /* line-continuation */ 402 else 403 return 2; 404 NEXT(2, 0); 405 continue; 406 } 407 408 if (c & 0x80) 409 return 1; 410 411 if (state->i == 0) { /* ASCII mode */ 412 WRITE1(c) 413 NEXT(1, 1) 414 } 415 else { /* GB mode */ 416 REQUIRE_INBUF(2) 417 REQUIRE_OUTBUF(1) 418 TRYMAP_DEC(gb2312, **outbuf, c, IN2) { 419 NEXT(2, 1) 420 } 421 else 422 return 2; 423 } 424 } 425 426 return 0; 427} 428 429 430BEGIN_MAPPINGS_LIST 431 MAPPING_DECONLY(gb2312) 432 MAPPING_DECONLY(gbkext) 433 MAPPING_ENCONLY(gbcommon) 434 MAPPING_ENCDEC(gb18030ext) 435END_MAPPINGS_LIST 436 437BEGIN_CODECS_LIST 438 CODEC_STATELESS(gb2312) 439 CODEC_STATELESS(gbk) 440 CODEC_STATELESS(gb18030) 441 CODEC_STATEFUL(hz) 442END_CODECS_LIST 443 444I_AM_A_MODULE_FOR(cn) 445