1/* 2 * multibytecodec.c: Common Multibyte Codec Implementation 3 * 4 * Written by Hye-Shik Chang <perky@FreeBSD.org> 5 */ 6 7#define PY_SSIZE_T_CLEAN 8#include "Python.h" 9#include "structmember.h" 10#include "multibytecodec.h" 11 12typedef struct { 13 const Py_UNICODE *inbuf, *inbuf_top, *inbuf_end; 14 unsigned char *outbuf, *outbuf_end; 15 PyObject *excobj, *outobj; 16} MultibyteEncodeBuffer; 17 18typedef struct { 19 const unsigned char *inbuf, *inbuf_top, *inbuf_end; 20 Py_UNICODE *outbuf, *outbuf_end; 21 PyObject *excobj, *outobj; 22} MultibyteDecodeBuffer; 23 24PyDoc_STRVAR(MultibyteCodec_Encode__doc__, 25"I.encode(unicode[, errors]) -> (string, length consumed)\n\ 26\n\ 27Return an encoded string version of `unicode'. errors may be given to\n\ 28set a different error handling scheme. Default is 'strict' meaning that\n\ 29encoding errors raise a UnicodeEncodeError. Other possible values are\n\ 30'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n\ 31registered with codecs.register_error that can handle UnicodeEncodeErrors."); 32 33PyDoc_STRVAR(MultibyteCodec_Decode__doc__, 34"I.decode(string[, errors]) -> (unicodeobject, length consumed)\n\ 35\n\ 36Decodes `string' using I, an MultibyteCodec instance. errors may be given\n\ 37to set a different error handling scheme. Default is 'strict' meaning\n\ 38that encoding errors raise a UnicodeDecodeError. Other possible values\n\ 39are 'ignore' and 'replace' as well as any other name registered with\n\ 40codecs.register_error that is able to handle UnicodeDecodeErrors."); 41 42static char *codeckwarglist[] = {"input", "errors", NULL}; 43static char *incnewkwarglist[] = {"errors", NULL}; 44static char *incrementalkwarglist[] = {"input", "final", NULL}; 45static char *streamkwarglist[] = {"stream", "errors", NULL}; 46 47static PyObject *multibytecodec_encode(MultibyteCodec *, 48 MultibyteCodec_State *, const Py_UNICODE **, Py_ssize_t, 49 PyObject *, int); 50 51#define MBENC_RESET MBENC_MAX<<1 /* reset after an encoding session */ 52 53static PyObject * 54make_tuple(PyObject *object, Py_ssize_t len) 55{ 56 PyObject *v, *w; 57 58 if (object == NULL) 59 return NULL; 60 61 v = PyTuple_New(2); 62 if (v == NULL) { 63 Py_DECREF(object); 64 return NULL; 65 } 66 PyTuple_SET_ITEM(v, 0, object); 67 68 w = PyInt_FromSsize_t(len); 69 if (w == NULL) { 70 Py_DECREF(v); 71 return NULL; 72 } 73 PyTuple_SET_ITEM(v, 1, w); 74 75 return v; 76} 77 78static PyObject * 79internal_error_callback(const char *errors) 80{ 81 if (errors == NULL || strcmp(errors, "strict") == 0) 82 return ERROR_STRICT; 83 else if (strcmp(errors, "ignore") == 0) 84 return ERROR_IGNORE; 85 else if (strcmp(errors, "replace") == 0) 86 return ERROR_REPLACE; 87 else 88 return PyString_FromString(errors); 89} 90 91static PyObject * 92call_error_callback(PyObject *errors, PyObject *exc) 93{ 94 PyObject *args, *cb, *r; 95 96 assert(PyString_Check(errors)); 97 cb = PyCodec_LookupError(PyString_AS_STRING(errors)); 98 if (cb == NULL) 99 return NULL; 100 101 args = PyTuple_New(1); 102 if (args == NULL) { 103 Py_DECREF(cb); 104 return NULL; 105 } 106 107 PyTuple_SET_ITEM(args, 0, exc); 108 Py_INCREF(exc); 109 110 r = PyObject_CallObject(cb, args); 111 Py_DECREF(args); 112 Py_DECREF(cb); 113 return r; 114} 115 116static PyObject * 117codecctx_errors_get(MultibyteStatefulCodecContext *self) 118{ 119 const char *errors; 120 121 if (self->errors == ERROR_STRICT) 122 errors = "strict"; 123 else if (self->errors == ERROR_IGNORE) 124 errors = "ignore"; 125 else if (self->errors == ERROR_REPLACE) 126 errors = "replace"; 127 else { 128 Py_INCREF(self->errors); 129 return self->errors; 130 } 131 132 return PyString_FromString(errors); 133} 134 135static int 136codecctx_errors_set(MultibyteStatefulCodecContext *self, PyObject *value, 137 void *closure) 138{ 139 PyObject *cb; 140 141 if (!PyString_Check(value)) { 142 PyErr_SetString(PyExc_TypeError, "errors must be a string"); 143 return -1; 144 } 145 146 cb = internal_error_callback(PyString_AS_STRING(value)); 147 if (cb == NULL) 148 return -1; 149 150 ERROR_DECREF(self->errors); 151 self->errors = cb; 152 return 0; 153} 154 155/* This getset handlers list is used by all the stateful codec objects */ 156static PyGetSetDef codecctx_getsets[] = { 157 {"errors", (getter)codecctx_errors_get, 158 (setter)codecctx_errors_set, 159 PyDoc_STR("how to treat errors")}, 160 {NULL,} 161}; 162 163static int 164expand_encodebuffer(MultibyteEncodeBuffer *buf, Py_ssize_t esize) 165{ 166 Py_ssize_t orgpos, orgsize, incsize; 167 168 orgpos = (Py_ssize_t)((char *)buf->outbuf - 169 PyString_AS_STRING(buf->outobj)); 170 orgsize = PyString_GET_SIZE(buf->outobj); 171 incsize = (esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize); 172 173 if (orgsize > PY_SSIZE_T_MAX - incsize) { 174 PyErr_NoMemory(); 175 return -1; 176 } 177 178 if (_PyString_Resize(&buf->outobj, orgsize + incsize) == -1) 179 return -1; 180 181 buf->outbuf = (unsigned char *)PyString_AS_STRING(buf->outobj) +orgpos; 182 buf->outbuf_end = (unsigned char *)PyString_AS_STRING(buf->outobj) 183 + PyString_GET_SIZE(buf->outobj); 184 185 return 0; 186} 187#define REQUIRE_ENCODEBUFFER(buf, s) do { \ 188 if ((s) < 0 || (s) > (buf)->outbuf_end - (buf)->outbuf) \ 189 if (expand_encodebuffer(buf, s) == -1) \ 190 goto errorexit; \ 191} while(0) 192 193static int 194expand_decodebuffer(MultibyteDecodeBuffer *buf, Py_ssize_t esize) 195{ 196 Py_ssize_t orgpos, orgsize; 197 198 orgpos = (Py_ssize_t)(buf->outbuf - PyUnicode_AS_UNICODE(buf->outobj)); 199 orgsize = PyUnicode_GET_SIZE(buf->outobj); 200 if (PyUnicode_Resize(&buf->outobj, orgsize + ( 201 esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize)) == -1) 202 return -1; 203 204 buf->outbuf = PyUnicode_AS_UNICODE(buf->outobj) + orgpos; 205 buf->outbuf_end = PyUnicode_AS_UNICODE(buf->outobj) 206 + PyUnicode_GET_SIZE(buf->outobj); 207 208 return 0; 209} 210#define REQUIRE_DECODEBUFFER(buf, s) do { \ 211 if ((s) < 0 || (s) > (buf)->outbuf_end - (buf)->outbuf) \ 212 if (expand_decodebuffer(buf, s) == -1) \ 213 goto errorexit; \ 214} while(0) 215 216 217/** 218 * MultibyteCodec object 219 */ 220 221static int 222multibytecodec_encerror(MultibyteCodec *codec, 223 MultibyteCodec_State *state, 224 MultibyteEncodeBuffer *buf, 225 PyObject *errors, Py_ssize_t e) 226{ 227 PyObject *retobj = NULL, *retstr = NULL, *tobj; 228 Py_ssize_t retstrsize, newpos; 229 Py_ssize_t esize, start, end; 230 const char *reason; 231 232 if (e > 0) { 233 reason = "illegal multibyte sequence"; 234 esize = e; 235 } 236 else { 237 switch (e) { 238 case MBERR_TOOSMALL: 239 REQUIRE_ENCODEBUFFER(buf, -1); 240 return 0; /* retry it */ 241 case MBERR_TOOFEW: 242 reason = "incomplete multibyte sequence"; 243 esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); 244 break; 245 case MBERR_INTERNAL: 246 PyErr_SetString(PyExc_RuntimeError, 247 "internal codec error"); 248 return -1; 249 default: 250 PyErr_SetString(PyExc_RuntimeError, 251 "unknown runtime error"); 252 return -1; 253 } 254 } 255 256 if (errors == ERROR_REPLACE) { 257 const Py_UNICODE replchar = '?', *inbuf = &replchar; 258 Py_ssize_t r; 259 260 for (;;) { 261 Py_ssize_t outleft; 262 263 outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf); 264 r = codec->encode(state, codec->config, &inbuf, 1, 265 &buf->outbuf, outleft, 0); 266 if (r == MBERR_TOOSMALL) { 267 REQUIRE_ENCODEBUFFER(buf, -1); 268 continue; 269 } 270 else 271 break; 272 } 273 274 if (r != 0) { 275 REQUIRE_ENCODEBUFFER(buf, 1); 276 *buf->outbuf++ = '?'; 277 } 278 } 279 if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) { 280 buf->inbuf += esize; 281 return 0; 282 } 283 284 start = (Py_ssize_t)(buf->inbuf - buf->inbuf_top); 285 end = start + esize; 286 287 /* use cached exception object if available */ 288 if (buf->excobj == NULL) { 289 buf->excobj = PyUnicodeEncodeError_Create(codec->encoding, 290 buf->inbuf_top, 291 buf->inbuf_end - buf->inbuf_top, 292 start, end, reason); 293 if (buf->excobj == NULL) 294 goto errorexit; 295 } 296 else 297 if (PyUnicodeEncodeError_SetStart(buf->excobj, start) != 0 || 298 PyUnicodeEncodeError_SetEnd(buf->excobj, end) != 0 || 299 PyUnicodeEncodeError_SetReason(buf->excobj, reason) != 0) 300 goto errorexit; 301 302 if (errors == ERROR_STRICT) { 303 PyCodec_StrictErrors(buf->excobj); 304 goto errorexit; 305 } 306 307 retobj = call_error_callback(errors, buf->excobj); 308 if (retobj == NULL) 309 goto errorexit; 310 311 if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 || 312 !PyUnicode_Check((tobj = PyTuple_GET_ITEM(retobj, 0))) || 313 !(PyInt_Check(PyTuple_GET_ITEM(retobj, 1)) || 314 PyLong_Check(PyTuple_GET_ITEM(retobj, 1)))) { 315 PyErr_SetString(PyExc_TypeError, 316 "encoding error handler must return " 317 "(unicode, int) tuple"); 318 goto errorexit; 319 } 320 321 { 322 const Py_UNICODE *uraw = PyUnicode_AS_UNICODE(tobj); 323 324 retstr = multibytecodec_encode(codec, state, &uraw, 325 PyUnicode_GET_SIZE(tobj), ERROR_STRICT, 326 MBENC_FLUSH); 327 if (retstr == NULL) 328 goto errorexit; 329 } 330 331 retstrsize = PyString_GET_SIZE(retstr); 332 if (retstrsize > 0) { 333 REQUIRE_ENCODEBUFFER(buf, retstrsize); 334 memcpy(buf->outbuf, PyString_AS_STRING(retstr), retstrsize); 335 buf->outbuf += retstrsize; 336 } 337 338 newpos = PyInt_AsSsize_t(PyTuple_GET_ITEM(retobj, 1)); 339 if (newpos < 0 && !PyErr_Occurred()) 340 newpos += (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top); 341 if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) { 342 PyErr_Clear(); 343 PyErr_Format(PyExc_IndexError, 344 "position %zd from error handler out of bounds", 345 newpos); 346 goto errorexit; 347 } 348 buf->inbuf = buf->inbuf_top + newpos; 349 350 Py_DECREF(retobj); 351 Py_DECREF(retstr); 352 return 0; 353 354errorexit: 355 Py_XDECREF(retobj); 356 Py_XDECREF(retstr); 357 return -1; 358} 359 360static int 361multibytecodec_decerror(MultibyteCodec *codec, 362 MultibyteCodec_State *state, 363 MultibyteDecodeBuffer *buf, 364 PyObject *errors, Py_ssize_t e) 365{ 366 PyObject *retobj = NULL, *retuni = NULL; 367 Py_ssize_t retunisize, newpos; 368 const char *reason; 369 Py_ssize_t esize, start, end; 370 371 if (e > 0) { 372 reason = "illegal multibyte sequence"; 373 esize = e; 374 } 375 else { 376 switch (e) { 377 case MBERR_TOOSMALL: 378 REQUIRE_DECODEBUFFER(buf, -1); 379 return 0; /* retry it */ 380 case MBERR_TOOFEW: 381 reason = "incomplete multibyte sequence"; 382 esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); 383 break; 384 case MBERR_INTERNAL: 385 PyErr_SetString(PyExc_RuntimeError, 386 "internal codec error"); 387 return -1; 388 default: 389 PyErr_SetString(PyExc_RuntimeError, 390 "unknown runtime error"); 391 return -1; 392 } 393 } 394 395 if (errors == ERROR_REPLACE) { 396 REQUIRE_DECODEBUFFER(buf, 1); 397 *buf->outbuf++ = Py_UNICODE_REPLACEMENT_CHARACTER; 398 } 399 if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) { 400 buf->inbuf += esize; 401 return 0; 402 } 403 404 start = (Py_ssize_t)(buf->inbuf - buf->inbuf_top); 405 end = start + esize; 406 407 /* use cached exception object if available */ 408 if (buf->excobj == NULL) { 409 buf->excobj = PyUnicodeDecodeError_Create(codec->encoding, 410 (const char *)buf->inbuf_top, 411 (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top), 412 start, end, reason); 413 if (buf->excobj == NULL) 414 goto errorexit; 415 } 416 else 417 if (PyUnicodeDecodeError_SetStart(buf->excobj, start) || 418 PyUnicodeDecodeError_SetEnd(buf->excobj, end) || 419 PyUnicodeDecodeError_SetReason(buf->excobj, reason)) 420 goto errorexit; 421 422 if (errors == ERROR_STRICT) { 423 PyCodec_StrictErrors(buf->excobj); 424 goto errorexit; 425 } 426 427 retobj = call_error_callback(errors, buf->excobj); 428 if (retobj == NULL) 429 goto errorexit; 430 431 if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 || 432 !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) || 433 !(PyInt_Check(PyTuple_GET_ITEM(retobj, 1)) || 434 PyLong_Check(PyTuple_GET_ITEM(retobj, 1)))) { 435 PyErr_SetString(PyExc_TypeError, 436 "decoding error handler must return " 437 "(unicode, int) tuple"); 438 goto errorexit; 439 } 440 441 retunisize = PyUnicode_GET_SIZE(retuni); 442 if (retunisize > 0) { 443 REQUIRE_DECODEBUFFER(buf, retunisize); 444 memcpy((char *)buf->outbuf, PyUnicode_AS_DATA(retuni), 445 retunisize * Py_UNICODE_SIZE); 446 buf->outbuf += retunisize; 447 } 448 449 newpos = PyInt_AsSsize_t(PyTuple_GET_ITEM(retobj, 1)); 450 if (newpos < 0 && !PyErr_Occurred()) 451 newpos += (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top); 452 if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) { 453 PyErr_Clear(); 454 PyErr_Format(PyExc_IndexError, 455 "position %zd from error handler out of bounds", 456 newpos); 457 goto errorexit; 458 } 459 buf->inbuf = buf->inbuf_top + newpos; 460 Py_DECREF(retobj); 461 return 0; 462 463errorexit: 464 Py_XDECREF(retobj); 465 return -1; 466} 467 468static PyObject * 469multibytecodec_encode(MultibyteCodec *codec, 470 MultibyteCodec_State *state, 471 const Py_UNICODE **data, Py_ssize_t datalen, 472 PyObject *errors, int flags) 473{ 474 MultibyteEncodeBuffer buf; 475 Py_ssize_t finalsize, r = 0; 476 477 if (datalen == 0 && !(flags & MBENC_RESET)) 478 return PyString_FromString(""); 479 480 buf.excobj = NULL; 481 buf.inbuf = buf.inbuf_top = *data; 482 buf.inbuf_end = buf.inbuf_top + datalen; 483 484 if (datalen > (PY_SSIZE_T_MAX - 16) / 2) { 485 PyErr_NoMemory(); 486 goto errorexit; 487 } 488 489 buf.outobj = PyString_FromStringAndSize(NULL, datalen * 2 + 16); 490 if (buf.outobj == NULL) 491 goto errorexit; 492 buf.outbuf = (unsigned char *)PyString_AS_STRING(buf.outobj); 493 buf.outbuf_end = buf.outbuf + PyString_GET_SIZE(buf.outobj); 494 495 while (buf.inbuf < buf.inbuf_end) { 496 Py_ssize_t inleft, outleft; 497 498 /* we don't reuse inleft and outleft here. 499 * error callbacks can relocate the cursor anywhere on buffer*/ 500 inleft = (Py_ssize_t)(buf.inbuf_end - buf.inbuf); 501 outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf); 502 r = codec->encode(state, codec->config, &buf.inbuf, inleft, 503 &buf.outbuf, outleft, flags); 504 if ((r == 0) || (r == MBERR_TOOFEW && !(flags & MBENC_FLUSH))) 505 break; 506 else if (multibytecodec_encerror(codec, state, &buf, errors,r)) 507 goto errorexit; 508 else if (r == MBERR_TOOFEW) 509 break; 510 } 511 512 if (codec->encreset != NULL && (flags & MBENC_RESET)) 513 for (;;) { 514 Py_ssize_t outleft; 515 516 outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf); 517 r = codec->encreset(state, codec->config, &buf.outbuf, 518 outleft); 519 if (r == 0) 520 break; 521 else if (multibytecodec_encerror(codec, state, 522 &buf, errors, r)) 523 goto errorexit; 524 } 525 526 finalsize = (Py_ssize_t)((char *)buf.outbuf - 527 PyString_AS_STRING(buf.outobj)); 528 529 if (finalsize != PyString_GET_SIZE(buf.outobj)) 530 if (_PyString_Resize(&buf.outobj, finalsize) == -1) 531 goto errorexit; 532 533 *data = buf.inbuf; 534 Py_XDECREF(buf.excobj); 535 return buf.outobj; 536 537errorexit: 538 Py_XDECREF(buf.excobj); 539 Py_XDECREF(buf.outobj); 540 return NULL; 541} 542 543static PyObject * 544MultibyteCodec_Encode(MultibyteCodecObject *self, 545 PyObject *args, PyObject *kwargs) 546{ 547 MultibyteCodec_State state; 548 Py_UNICODE *data; 549 PyObject *errorcb, *r, *arg, *ucvt; 550 const char *errors = NULL; 551 Py_ssize_t datalen; 552 553 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|z:encode", 554 codeckwarglist, &arg, &errors)) 555 return NULL; 556 557 if (PyUnicode_Check(arg)) 558 ucvt = NULL; 559 else { 560 arg = ucvt = PyObject_Unicode(arg); 561 if (arg == NULL) 562 return NULL; 563 else if (!PyUnicode_Check(arg)) { 564 PyErr_SetString(PyExc_TypeError, 565 "couldn't convert the object to unicode."); 566 Py_DECREF(ucvt); 567 return NULL; 568 } 569 } 570 571 data = PyUnicode_AS_UNICODE(arg); 572 datalen = PyUnicode_GET_SIZE(arg); 573 574 errorcb = internal_error_callback(errors); 575 if (errorcb == NULL) { 576 Py_XDECREF(ucvt); 577 return NULL; 578 } 579 580 if (self->codec->encinit != NULL && 581 self->codec->encinit(&state, self->codec->config) != 0) 582 goto errorexit; 583 r = multibytecodec_encode(self->codec, &state, 584 (const Py_UNICODE **)&data, datalen, errorcb, 585 MBENC_FLUSH | MBENC_RESET); 586 if (r == NULL) 587 goto errorexit; 588 589 ERROR_DECREF(errorcb); 590 Py_XDECREF(ucvt); 591 return make_tuple(r, datalen); 592 593errorexit: 594 ERROR_DECREF(errorcb); 595 Py_XDECREF(ucvt); 596 return NULL; 597} 598 599static PyObject * 600MultibyteCodec_Decode(MultibyteCodecObject *self, 601 PyObject *args, PyObject *kwargs) 602{ 603 MultibyteCodec_State state; 604 MultibyteDecodeBuffer buf; 605 PyObject *errorcb; 606 Py_buffer pdata; 607 const char *data, *errors = NULL; 608 Py_ssize_t datalen, finalsize; 609 610 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|z:decode", 611 codeckwarglist, &pdata, &errors)) 612 return NULL; 613 data = pdata.buf; 614 datalen = pdata.len; 615 616 errorcb = internal_error_callback(errors); 617 if (errorcb == NULL) { 618 PyBuffer_Release(&pdata); 619 return NULL; 620 } 621 622 if (datalen == 0) { 623 PyBuffer_Release(&pdata); 624 ERROR_DECREF(errorcb); 625 return make_tuple(PyUnicode_FromUnicode(NULL, 0), 0); 626 } 627 628 buf.excobj = NULL; 629 buf.inbuf = buf.inbuf_top = (unsigned char *)data; 630 buf.inbuf_end = buf.inbuf_top + datalen; 631 buf.outobj = PyUnicode_FromUnicode(NULL, datalen); 632 if (buf.outobj == NULL) 633 goto errorexit; 634 buf.outbuf = PyUnicode_AS_UNICODE(buf.outobj); 635 buf.outbuf_end = buf.outbuf + PyUnicode_GET_SIZE(buf.outobj); 636 637 if (self->codec->decinit != NULL && 638 self->codec->decinit(&state, self->codec->config) != 0) 639 goto errorexit; 640 641 while (buf.inbuf < buf.inbuf_end) { 642 Py_ssize_t inleft, outleft, r; 643 644 inleft = (Py_ssize_t)(buf.inbuf_end - buf.inbuf); 645 outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf); 646 647 r = self->codec->decode(&state, self->codec->config, 648 &buf.inbuf, inleft, &buf.outbuf, outleft); 649 if (r == 0) 650 break; 651 else if (multibytecodec_decerror(self->codec, &state, 652 &buf, errorcb, r)) 653 goto errorexit; 654 } 655 656 finalsize = (Py_ssize_t)(buf.outbuf - 657 PyUnicode_AS_UNICODE(buf.outobj)); 658 659 if (finalsize != PyUnicode_GET_SIZE(buf.outobj)) 660 if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) 661 goto errorexit; 662 663 PyBuffer_Release(&pdata); 664 Py_XDECREF(buf.excobj); 665 ERROR_DECREF(errorcb); 666 return make_tuple(buf.outobj, datalen); 667 668errorexit: 669 PyBuffer_Release(&pdata); 670 ERROR_DECREF(errorcb); 671 Py_XDECREF(buf.excobj); 672 Py_XDECREF(buf.outobj); 673 674 return NULL; 675} 676 677static struct PyMethodDef multibytecodec_methods[] = { 678 {"encode", (PyCFunction)MultibyteCodec_Encode, 679 METH_VARARGS | METH_KEYWORDS, 680 MultibyteCodec_Encode__doc__}, 681 {"decode", (PyCFunction)MultibyteCodec_Decode, 682 METH_VARARGS | METH_KEYWORDS, 683 MultibyteCodec_Decode__doc__}, 684 {NULL, NULL}, 685}; 686 687static void 688multibytecodec_dealloc(MultibyteCodecObject *self) 689{ 690 PyObject_Del(self); 691} 692 693static PyTypeObject MultibyteCodec_Type = { 694 PyVarObject_HEAD_INIT(NULL, 0) 695 "MultibyteCodec", /* tp_name */ 696 sizeof(MultibyteCodecObject), /* tp_basicsize */ 697 0, /* tp_itemsize */ 698 /* methods */ 699 (destructor)multibytecodec_dealloc, /* tp_dealloc */ 700 0, /* tp_print */ 701 0, /* tp_getattr */ 702 0, /* tp_setattr */ 703 0, /* tp_compare */ 704 0, /* tp_repr */ 705 0, /* tp_as_number */ 706 0, /* tp_as_sequence */ 707 0, /* tp_as_mapping */ 708 0, /* tp_hash */ 709 0, /* tp_call */ 710 0, /* tp_str */ 711 PyObject_GenericGetAttr, /* tp_getattro */ 712 0, /* tp_setattro */ 713 0, /* tp_as_buffer */ 714 Py_TPFLAGS_DEFAULT, /* tp_flags */ 715 0, /* tp_doc */ 716 0, /* tp_traverse */ 717 0, /* tp_clear */ 718 0, /* tp_richcompare */ 719 0, /* tp_weaklistoffset */ 720 0, /* tp_iter */ 721 0, /* tp_iterext */ 722 multibytecodec_methods, /* tp_methods */ 723}; 724 725 726/** 727 * Utility functions for stateful codec mechanism 728 */ 729 730#define STATEFUL_DCTX(o) ((MultibyteStatefulDecoderContext *)(o)) 731#define STATEFUL_ECTX(o) ((MultibyteStatefulEncoderContext *)(o)) 732 733static PyObject * 734encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, 735 PyObject *unistr, int final) 736{ 737 PyObject *ucvt, *r = NULL; 738 Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL; 739 Py_ssize_t datalen, origpending; 740 741 if (PyUnicode_Check(unistr)) 742 ucvt = NULL; 743 else { 744 unistr = ucvt = PyObject_Unicode(unistr); 745 if (unistr == NULL) 746 return NULL; 747 else if (!PyUnicode_Check(unistr)) { 748 PyErr_SetString(PyExc_TypeError, 749 "couldn't convert the object to unicode."); 750 Py_DECREF(ucvt); 751 return NULL; 752 } 753 } 754 755 datalen = PyUnicode_GET_SIZE(unistr); 756 origpending = ctx->pendingsize; 757 758 if (origpending > 0) { 759 if (datalen > PY_SSIZE_T_MAX - ctx->pendingsize) { 760 PyErr_NoMemory(); 761 /* inbuf_tmp == NULL */ 762 goto errorexit; 763 } 764 inbuf_tmp = PyMem_New(Py_UNICODE, datalen + ctx->pendingsize); 765 if (inbuf_tmp == NULL) 766 goto errorexit; 767 memcpy(inbuf_tmp, ctx->pending, 768 Py_UNICODE_SIZE * ctx->pendingsize); 769 memcpy(inbuf_tmp + ctx->pendingsize, 770 PyUnicode_AS_UNICODE(unistr), 771 Py_UNICODE_SIZE * datalen); 772 datalen += ctx->pendingsize; 773 ctx->pendingsize = 0; 774 inbuf = inbuf_tmp; 775 } 776 else 777 inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr); 778 779 inbuf_end = inbuf + datalen; 780 781 r = multibytecodec_encode(ctx->codec, &ctx->state, 782 (const Py_UNICODE **)&inbuf, datalen, 783 ctx->errors, final ? MBENC_FLUSH | MBENC_RESET : 0); 784 if (r == NULL) { 785 /* recover the original pending buffer */ 786 if (origpending > 0) 787 memcpy(ctx->pending, inbuf_tmp, 788 Py_UNICODE_SIZE * origpending); 789 ctx->pendingsize = origpending; 790 goto errorexit; 791 } 792 793 if (inbuf < inbuf_end) { 794 ctx->pendingsize = (Py_ssize_t)(inbuf_end - inbuf); 795 if (ctx->pendingsize > MAXENCPENDING) { 796 /* normal codecs can't reach here */ 797 ctx->pendingsize = 0; 798 PyErr_SetString(PyExc_UnicodeError, 799 "pending buffer overflow"); 800 goto errorexit; 801 } 802 memcpy(ctx->pending, inbuf, 803 ctx->pendingsize * Py_UNICODE_SIZE); 804 } 805 806 if (inbuf_tmp != NULL) 807 PyMem_Del(inbuf_tmp); 808 Py_XDECREF(ucvt); 809 return r; 810 811errorexit: 812 if (inbuf_tmp != NULL) 813 PyMem_Del(inbuf_tmp); 814 Py_XDECREF(r); 815 Py_XDECREF(ucvt); 816 return NULL; 817} 818 819static int 820decoder_append_pending(MultibyteStatefulDecoderContext *ctx, 821 MultibyteDecodeBuffer *buf) 822{ 823 Py_ssize_t npendings; 824 825 npendings = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); 826 if (npendings + ctx->pendingsize > MAXDECPENDING || 827 npendings > PY_SSIZE_T_MAX - ctx->pendingsize) { 828 PyErr_SetString(PyExc_UnicodeError, "pending buffer overflow"); 829 return -1; 830 } 831 memcpy(ctx->pending + ctx->pendingsize, buf->inbuf, npendings); 832 ctx->pendingsize += npendings; 833 return 0; 834} 835 836static int 837decoder_prepare_buffer(MultibyteDecodeBuffer *buf, const char *data, 838 Py_ssize_t size) 839{ 840 buf->inbuf = buf->inbuf_top = (const unsigned char *)data; 841 buf->inbuf_end = buf->inbuf_top + size; 842 if (buf->outobj == NULL) { /* only if outobj is not allocated yet */ 843 buf->outobj = PyUnicode_FromUnicode(NULL, size); 844 if (buf->outobj == NULL) 845 return -1; 846 buf->outbuf = PyUnicode_AS_UNICODE(buf->outobj); 847 buf->outbuf_end = buf->outbuf + 848 PyUnicode_GET_SIZE(buf->outobj); 849 } 850 851 return 0; 852} 853 854static int 855decoder_feed_buffer(MultibyteStatefulDecoderContext *ctx, 856 MultibyteDecodeBuffer *buf) 857{ 858 while (buf->inbuf < buf->inbuf_end) { 859 Py_ssize_t inleft, outleft; 860 Py_ssize_t r; 861 862 inleft = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); 863 outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf); 864 865 r = ctx->codec->decode(&ctx->state, ctx->codec->config, 866 &buf->inbuf, inleft, &buf->outbuf, outleft); 867 if (r == 0 || r == MBERR_TOOFEW) 868 break; 869 else if (multibytecodec_decerror(ctx->codec, &ctx->state, 870 buf, ctx->errors, r)) 871 return -1; 872 } 873 return 0; 874} 875 876 877/** 878 * MultibyteIncrementalEncoder object 879 */ 880 881static PyObject * 882mbiencoder_encode(MultibyteIncrementalEncoderObject *self, 883 PyObject *args, PyObject *kwargs) 884{ 885 PyObject *data; 886 int final = 0; 887 888 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|i:encode", 889 incrementalkwarglist, &data, &final)) 890 return NULL; 891 892 return encoder_encode_stateful(STATEFUL_ECTX(self), data, final); 893} 894 895static PyObject * 896mbiencoder_reset(MultibyteIncrementalEncoderObject *self) 897{ 898 if (self->codec->decreset != NULL && 899 self->codec->decreset(&self->state, self->codec->config) != 0) 900 return NULL; 901 self->pendingsize = 0; 902 903 Py_RETURN_NONE; 904} 905 906static struct PyMethodDef mbiencoder_methods[] = { 907 {"encode", (PyCFunction)mbiencoder_encode, 908 METH_VARARGS | METH_KEYWORDS, NULL}, 909 {"reset", (PyCFunction)mbiencoder_reset, 910 METH_NOARGS, NULL}, 911 {NULL, NULL}, 912}; 913 914static PyObject * 915mbiencoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 916{ 917 MultibyteIncrementalEncoderObject *self; 918 PyObject *codec = NULL; 919 char *errors = NULL; 920 921 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|s:IncrementalEncoder", 922 incnewkwarglist, &errors)) 923 return NULL; 924 925 self = (MultibyteIncrementalEncoderObject *)type->tp_alloc(type, 0); 926 if (self == NULL) 927 return NULL; 928 929 codec = PyObject_GetAttrString((PyObject *)type, "codec"); 930 if (codec == NULL) 931 goto errorexit; 932 if (!MultibyteCodec_Check(codec)) { 933 PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); 934 goto errorexit; 935 } 936 937 self->codec = ((MultibyteCodecObject *)codec)->codec; 938 self->pendingsize = 0; 939 self->errors = internal_error_callback(errors); 940 if (self->errors == NULL) 941 goto errorexit; 942 if (self->codec->encinit != NULL && 943 self->codec->encinit(&self->state, self->codec->config) != 0) 944 goto errorexit; 945 946 Py_DECREF(codec); 947 return (PyObject *)self; 948 949errorexit: 950 Py_XDECREF(self); 951 Py_XDECREF(codec); 952 return NULL; 953} 954 955static int 956mbiencoder_init(PyObject *self, PyObject *args, PyObject *kwds) 957{ 958 return 0; 959} 960 961static int 962mbiencoder_traverse(MultibyteIncrementalEncoderObject *self, 963 visitproc visit, void *arg) 964{ 965 if (ERROR_ISCUSTOM(self->errors)) 966 Py_VISIT(self->errors); 967 return 0; 968} 969 970static void 971mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self) 972{ 973 PyObject_GC_UnTrack(self); 974 ERROR_DECREF(self->errors); 975 Py_TYPE(self)->tp_free(self); 976} 977 978static PyTypeObject MultibyteIncrementalEncoder_Type = { 979 PyVarObject_HEAD_INIT(NULL, 0) 980 "MultibyteIncrementalEncoder", /* tp_name */ 981 sizeof(MultibyteIncrementalEncoderObject), /* tp_basicsize */ 982 0, /* tp_itemsize */ 983 /* methods */ 984 (destructor)mbiencoder_dealloc, /* tp_dealloc */ 985 0, /* tp_print */ 986 0, /* tp_getattr */ 987 0, /* tp_setattr */ 988 0, /* tp_compare */ 989 0, /* tp_repr */ 990 0, /* tp_as_number */ 991 0, /* tp_as_sequence */ 992 0, /* tp_as_mapping */ 993 0, /* tp_hash */ 994 0, /* tp_call */ 995 0, /* tp_str */ 996 PyObject_GenericGetAttr, /* tp_getattro */ 997 0, /* tp_setattro */ 998 0, /* tp_as_buffer */ 999 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC 1000 | Py_TPFLAGS_BASETYPE, /* tp_flags */ 1001 0, /* tp_doc */ 1002 (traverseproc)mbiencoder_traverse, /* tp_traverse */ 1003 0, /* tp_clear */ 1004 0, /* tp_richcompare */ 1005 0, /* tp_weaklistoffset */ 1006 0, /* tp_iter */ 1007 0, /* tp_iterext */ 1008 mbiencoder_methods, /* tp_methods */ 1009 0, /* tp_members */ 1010 codecctx_getsets, /* tp_getset */ 1011 0, /* tp_base */ 1012 0, /* tp_dict */ 1013 0, /* tp_descr_get */ 1014 0, /* tp_descr_set */ 1015 0, /* tp_dictoffset */ 1016 mbiencoder_init, /* tp_init */ 1017 0, /* tp_alloc */ 1018 mbiencoder_new, /* tp_new */ 1019}; 1020 1021 1022/** 1023 * MultibyteIncrementalDecoder object 1024 */ 1025 1026static PyObject * 1027mbidecoder_decode(MultibyteIncrementalDecoderObject *self, 1028 PyObject *args, PyObject *kwargs) 1029{ 1030 MultibyteDecodeBuffer buf; 1031 char *data, *wdata = NULL; 1032 Py_buffer pdata; 1033 Py_ssize_t wsize, finalsize = 0, size, origpending; 1034 int final = 0; 1035 1036 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s*|i:decode", 1037 incrementalkwarglist, &pdata, &final)) 1038 return NULL; 1039 data = pdata.buf; 1040 size = pdata.len; 1041 1042 buf.outobj = buf.excobj = NULL; 1043 origpending = self->pendingsize; 1044 1045 if (self->pendingsize == 0) { 1046 wsize = size; 1047 wdata = data; 1048 } 1049 else { 1050 if (size > PY_SSIZE_T_MAX - self->pendingsize) { 1051 PyErr_NoMemory(); 1052 goto errorexit; 1053 } 1054 wsize = size + self->pendingsize; 1055 wdata = PyMem_Malloc(wsize); 1056 if (wdata == NULL) 1057 goto errorexit; 1058 memcpy(wdata, self->pending, self->pendingsize); 1059 memcpy(wdata + self->pendingsize, data, size); 1060 self->pendingsize = 0; 1061 } 1062 1063 if (decoder_prepare_buffer(&buf, wdata, wsize) != 0) 1064 goto errorexit; 1065 1066 if (decoder_feed_buffer(STATEFUL_DCTX(self), &buf)) 1067 goto errorexit; 1068 1069 if (final && buf.inbuf < buf.inbuf_end) { 1070 if (multibytecodec_decerror(self->codec, &self->state, 1071 &buf, self->errors, MBERR_TOOFEW)) { 1072 /* recover the original pending buffer */ 1073 memcpy(self->pending, wdata, origpending); 1074 self->pendingsize = origpending; 1075 goto errorexit; 1076 } 1077 } 1078 1079 if (buf.inbuf < buf.inbuf_end) { /* pending sequence still exists */ 1080 if (decoder_append_pending(STATEFUL_DCTX(self), &buf) != 0) 1081 goto errorexit; 1082 } 1083 1084 finalsize = (Py_ssize_t)(buf.outbuf - PyUnicode_AS_UNICODE(buf.outobj)); 1085 if (finalsize != PyUnicode_GET_SIZE(buf.outobj)) 1086 if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) 1087 goto errorexit; 1088 1089 PyBuffer_Release(&pdata); 1090 if (wdata != data) 1091 PyMem_Del(wdata); 1092 Py_XDECREF(buf.excobj); 1093 return buf.outobj; 1094 1095errorexit: 1096 PyBuffer_Release(&pdata); 1097 if (wdata != NULL && wdata != data) 1098 PyMem_Del(wdata); 1099 Py_XDECREF(buf.excobj); 1100 Py_XDECREF(buf.outobj); 1101 return NULL; 1102} 1103 1104static PyObject * 1105mbidecoder_reset(MultibyteIncrementalDecoderObject *self) 1106{ 1107 if (self->codec->decreset != NULL && 1108 self->codec->decreset(&self->state, self->codec->config) != 0) 1109 return NULL; 1110 self->pendingsize = 0; 1111 1112 Py_RETURN_NONE; 1113} 1114 1115static struct PyMethodDef mbidecoder_methods[] = { 1116 {"decode", (PyCFunction)mbidecoder_decode, 1117 METH_VARARGS | METH_KEYWORDS, NULL}, 1118 {"reset", (PyCFunction)mbidecoder_reset, 1119 METH_NOARGS, NULL}, 1120 {NULL, NULL}, 1121}; 1122 1123static PyObject * 1124mbidecoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 1125{ 1126 MultibyteIncrementalDecoderObject *self; 1127 PyObject *codec = NULL; 1128 char *errors = NULL; 1129 1130 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|s:IncrementalDecoder", 1131 incnewkwarglist, &errors)) 1132 return NULL; 1133 1134 self = (MultibyteIncrementalDecoderObject *)type->tp_alloc(type, 0); 1135 if (self == NULL) 1136 return NULL; 1137 1138 codec = PyObject_GetAttrString((PyObject *)type, "codec"); 1139 if (codec == NULL) 1140 goto errorexit; 1141 if (!MultibyteCodec_Check(codec)) { 1142 PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); 1143 goto errorexit; 1144 } 1145 1146 self->codec = ((MultibyteCodecObject *)codec)->codec; 1147 self->pendingsize = 0; 1148 self->errors = internal_error_callback(errors); 1149 if (self->errors == NULL) 1150 goto errorexit; 1151 if (self->codec->decinit != NULL && 1152 self->codec->decinit(&self->state, self->codec->config) != 0) 1153 goto errorexit; 1154 1155 Py_DECREF(codec); 1156 return (PyObject *)self; 1157 1158errorexit: 1159 Py_XDECREF(self); 1160 Py_XDECREF(codec); 1161 return NULL; 1162} 1163 1164static int 1165mbidecoder_init(PyObject *self, PyObject *args, PyObject *kwds) 1166{ 1167 return 0; 1168} 1169 1170static int 1171mbidecoder_traverse(MultibyteIncrementalDecoderObject *self, 1172 visitproc visit, void *arg) 1173{ 1174 if (ERROR_ISCUSTOM(self->errors)) 1175 Py_VISIT(self->errors); 1176 return 0; 1177} 1178 1179static void 1180mbidecoder_dealloc(MultibyteIncrementalDecoderObject *self) 1181{ 1182 PyObject_GC_UnTrack(self); 1183 ERROR_DECREF(self->errors); 1184 Py_TYPE(self)->tp_free(self); 1185} 1186 1187static PyTypeObject MultibyteIncrementalDecoder_Type = { 1188 PyVarObject_HEAD_INIT(NULL, 0) 1189 "MultibyteIncrementalDecoder", /* tp_name */ 1190 sizeof(MultibyteIncrementalDecoderObject), /* tp_basicsize */ 1191 0, /* tp_itemsize */ 1192 /* methods */ 1193 (destructor)mbidecoder_dealloc, /* tp_dealloc */ 1194 0, /* tp_print */ 1195 0, /* tp_getattr */ 1196 0, /* tp_setattr */ 1197 0, /* tp_compare */ 1198 0, /* tp_repr */ 1199 0, /* tp_as_number */ 1200 0, /* tp_as_sequence */ 1201 0, /* tp_as_mapping */ 1202 0, /* tp_hash */ 1203 0, /* tp_call */ 1204 0, /* tp_str */ 1205 PyObject_GenericGetAttr, /* tp_getattro */ 1206 0, /* tp_setattro */ 1207 0, /* tp_as_buffer */ 1208 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC 1209 | Py_TPFLAGS_BASETYPE, /* tp_flags */ 1210 0, /* tp_doc */ 1211 (traverseproc)mbidecoder_traverse, /* tp_traverse */ 1212 0, /* tp_clear */ 1213 0, /* tp_richcompare */ 1214 0, /* tp_weaklistoffset */ 1215 0, /* tp_iter */ 1216 0, /* tp_iterext */ 1217 mbidecoder_methods, /* tp_methods */ 1218 0, /* tp_members */ 1219 codecctx_getsets, /* tp_getset */ 1220 0, /* tp_base */ 1221 0, /* tp_dict */ 1222 0, /* tp_descr_get */ 1223 0, /* tp_descr_set */ 1224 0, /* tp_dictoffset */ 1225 mbidecoder_init, /* tp_init */ 1226 0, /* tp_alloc */ 1227 mbidecoder_new, /* tp_new */ 1228}; 1229 1230 1231/** 1232 * MultibyteStreamReader object 1233 */ 1234 1235static PyObject * 1236mbstreamreader_iread(MultibyteStreamReaderObject *self, 1237 const char *method, Py_ssize_t sizehint) 1238{ 1239 MultibyteDecodeBuffer buf; 1240 PyObject *cres; 1241 Py_ssize_t rsize, finalsize = 0; 1242 1243 if (sizehint == 0) 1244 return PyUnicode_FromUnicode(NULL, 0); 1245 1246 buf.outobj = buf.excobj = NULL; 1247 cres = NULL; 1248 1249 for (;;) { 1250 int endoffile; 1251 1252 if (sizehint < 0) 1253 cres = PyObject_CallMethod(self->stream, 1254 (char *)method, NULL); 1255 else 1256 cres = PyObject_CallMethod(self->stream, 1257 (char *)method, "i", sizehint); 1258 if (cres == NULL) 1259 goto errorexit; 1260 1261 if (!PyString_Check(cres)) { 1262 PyErr_SetString(PyExc_TypeError, 1263 "stream function returned a " 1264 "non-string object"); 1265 goto errorexit; 1266 } 1267 1268 endoffile = (PyString_GET_SIZE(cres) == 0); 1269 1270 if (self->pendingsize > 0) { 1271 PyObject *ctr; 1272 char *ctrdata; 1273 1274 if (PyString_GET_SIZE(cres) > PY_SSIZE_T_MAX - self->pendingsize) { 1275 PyErr_NoMemory(); 1276 goto errorexit; 1277 } 1278 rsize = PyString_GET_SIZE(cres) + self->pendingsize; 1279 ctr = PyString_FromStringAndSize(NULL, rsize); 1280 if (ctr == NULL) 1281 goto errorexit; 1282 ctrdata = PyString_AS_STRING(ctr); 1283 memcpy(ctrdata, self->pending, self->pendingsize); 1284 memcpy(ctrdata + self->pendingsize, 1285 PyString_AS_STRING(cres), 1286 PyString_GET_SIZE(cres)); 1287 Py_DECREF(cres); 1288 cres = ctr; 1289 self->pendingsize = 0; 1290 } 1291 1292 rsize = PyString_GET_SIZE(cres); 1293 if (decoder_prepare_buffer(&buf, PyString_AS_STRING(cres), 1294 rsize) != 0) 1295 goto errorexit; 1296 1297 if (rsize > 0 && decoder_feed_buffer( 1298 (MultibyteStatefulDecoderContext *)self, &buf)) 1299 goto errorexit; 1300 1301 if (endoffile || sizehint < 0) { 1302 if (buf.inbuf < buf.inbuf_end && 1303 multibytecodec_decerror(self->codec, &self->state, 1304 &buf, self->errors, MBERR_TOOFEW)) 1305 goto errorexit; 1306 } 1307 1308 if (buf.inbuf < buf.inbuf_end) { /* pending sequence exists */ 1309 if (decoder_append_pending(STATEFUL_DCTX(self), 1310 &buf) != 0) 1311 goto errorexit; 1312 } 1313 1314 finalsize = (Py_ssize_t)(buf.outbuf - 1315 PyUnicode_AS_UNICODE(buf.outobj)); 1316 Py_DECREF(cres); 1317 cres = NULL; 1318 1319 if (sizehint < 0 || finalsize != 0 || rsize == 0) 1320 break; 1321 1322 sizehint = 1; /* read 1 more byte and retry */ 1323 } 1324 1325 if (finalsize != PyUnicode_GET_SIZE(buf.outobj)) 1326 if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) 1327 goto errorexit; 1328 1329 Py_XDECREF(cres); 1330 Py_XDECREF(buf.excobj); 1331 return buf.outobj; 1332 1333errorexit: 1334 Py_XDECREF(cres); 1335 Py_XDECREF(buf.excobj); 1336 Py_XDECREF(buf.outobj); 1337 return NULL; 1338} 1339 1340static PyObject * 1341mbstreamreader_read(MultibyteStreamReaderObject *self, PyObject *args) 1342{ 1343 PyObject *sizeobj = NULL; 1344 Py_ssize_t size; 1345 1346 if (!PyArg_UnpackTuple(args, "read", 0, 1, &sizeobj)) 1347 return NULL; 1348 1349 if (sizeobj == Py_None || sizeobj == NULL) 1350 size = -1; 1351 else if (PyInt_Check(sizeobj)) 1352 size = PyInt_AsSsize_t(sizeobj); 1353 else { 1354 PyErr_SetString(PyExc_TypeError, "arg 1 must be an integer"); 1355 return NULL; 1356 } 1357 1358 return mbstreamreader_iread(self, "read", size); 1359} 1360 1361static PyObject * 1362mbstreamreader_readline(MultibyteStreamReaderObject *self, PyObject *args) 1363{ 1364 PyObject *sizeobj = NULL; 1365 Py_ssize_t size; 1366 1367 if (!PyArg_UnpackTuple(args, "readline", 0, 1, &sizeobj)) 1368 return NULL; 1369 1370 if (sizeobj == Py_None || sizeobj == NULL) 1371 size = -1; 1372 else if (PyInt_Check(sizeobj)) 1373 size = PyInt_AsSsize_t(sizeobj); 1374 else { 1375 PyErr_SetString(PyExc_TypeError, "arg 1 must be an integer"); 1376 return NULL; 1377 } 1378 1379 return mbstreamreader_iread(self, "readline", size); 1380} 1381 1382static PyObject * 1383mbstreamreader_readlines(MultibyteStreamReaderObject *self, PyObject *args) 1384{ 1385 PyObject *sizehintobj = NULL, *r, *sr; 1386 Py_ssize_t sizehint; 1387 1388 if (!PyArg_UnpackTuple(args, "readlines", 0, 1, &sizehintobj)) 1389 return NULL; 1390 1391 if (sizehintobj == Py_None || sizehintobj == NULL) 1392 sizehint = -1; 1393 else if (PyInt_Check(sizehintobj)) 1394 sizehint = PyInt_AsSsize_t(sizehintobj); 1395 else { 1396 PyErr_SetString(PyExc_TypeError, "arg 1 must be an integer"); 1397 return NULL; 1398 } 1399 1400 r = mbstreamreader_iread(self, "read", sizehint); 1401 if (r == NULL) 1402 return NULL; 1403 1404 sr = PyUnicode_Splitlines(r, 1); 1405 Py_DECREF(r); 1406 return sr; 1407} 1408 1409static PyObject * 1410mbstreamreader_reset(MultibyteStreamReaderObject *self) 1411{ 1412 if (self->codec->decreset != NULL && 1413 self->codec->decreset(&self->state, self->codec->config) != 0) 1414 return NULL; 1415 self->pendingsize = 0; 1416 1417 Py_RETURN_NONE; 1418} 1419 1420static struct PyMethodDef mbstreamreader_methods[] = { 1421 {"read", (PyCFunction)mbstreamreader_read, 1422 METH_VARARGS, NULL}, 1423 {"readline", (PyCFunction)mbstreamreader_readline, 1424 METH_VARARGS, NULL}, 1425 {"readlines", (PyCFunction)mbstreamreader_readlines, 1426 METH_VARARGS, NULL}, 1427 {"reset", (PyCFunction)mbstreamreader_reset, 1428 METH_NOARGS, NULL}, 1429 {NULL, NULL}, 1430}; 1431 1432static PyMemberDef mbstreamreader_members[] = { 1433 {"stream", T_OBJECT, 1434 offsetof(MultibyteStreamReaderObject, stream), 1435 READONLY, NULL}, 1436 {NULL,} 1437}; 1438 1439static PyObject * 1440mbstreamreader_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 1441{ 1442 MultibyteStreamReaderObject *self; 1443 PyObject *stream, *codec = NULL; 1444 char *errors = NULL; 1445 1446 if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|s:StreamReader", 1447 streamkwarglist, &stream, &errors)) 1448 return NULL; 1449 1450 self = (MultibyteStreamReaderObject *)type->tp_alloc(type, 0); 1451 if (self == NULL) 1452 return NULL; 1453 1454 codec = PyObject_GetAttrString((PyObject *)type, "codec"); 1455 if (codec == NULL) 1456 goto errorexit; 1457 if (!MultibyteCodec_Check(codec)) { 1458 PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); 1459 goto errorexit; 1460 } 1461 1462 self->codec = ((MultibyteCodecObject *)codec)->codec; 1463 self->stream = stream; 1464 Py_INCREF(stream); 1465 self->pendingsize = 0; 1466 self->errors = internal_error_callback(errors); 1467 if (self->errors == NULL) 1468 goto errorexit; 1469 if (self->codec->decinit != NULL && 1470 self->codec->decinit(&self->state, self->codec->config) != 0) 1471 goto errorexit; 1472 1473 Py_DECREF(codec); 1474 return (PyObject *)self; 1475 1476errorexit: 1477 Py_XDECREF(self); 1478 Py_XDECREF(codec); 1479 return NULL; 1480} 1481 1482static int 1483mbstreamreader_init(PyObject *self, PyObject *args, PyObject *kwds) 1484{ 1485 return 0; 1486} 1487 1488static int 1489mbstreamreader_traverse(MultibyteStreamReaderObject *self, 1490 visitproc visit, void *arg) 1491{ 1492 if (ERROR_ISCUSTOM(self->errors)) 1493 Py_VISIT(self->errors); 1494 Py_VISIT(self->stream); 1495 return 0; 1496} 1497 1498static void 1499mbstreamreader_dealloc(MultibyteStreamReaderObject *self) 1500{ 1501 PyObject_GC_UnTrack(self); 1502 ERROR_DECREF(self->errors); 1503 Py_XDECREF(self->stream); 1504 Py_TYPE(self)->tp_free(self); 1505} 1506 1507static PyTypeObject MultibyteStreamReader_Type = { 1508 PyVarObject_HEAD_INIT(NULL, 0) 1509 "MultibyteStreamReader", /* tp_name */ 1510 sizeof(MultibyteStreamReaderObject), /* tp_basicsize */ 1511 0, /* tp_itemsize */ 1512 /* methods */ 1513 (destructor)mbstreamreader_dealloc, /* tp_dealloc */ 1514 0, /* tp_print */ 1515 0, /* tp_getattr */ 1516 0, /* tp_setattr */ 1517 0, /* tp_compare */ 1518 0, /* tp_repr */ 1519 0, /* tp_as_number */ 1520 0, /* tp_as_sequence */ 1521 0, /* tp_as_mapping */ 1522 0, /* tp_hash */ 1523 0, /* tp_call */ 1524 0, /* tp_str */ 1525 PyObject_GenericGetAttr, /* tp_getattro */ 1526 0, /* tp_setattro */ 1527 0, /* tp_as_buffer */ 1528 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC 1529 | Py_TPFLAGS_BASETYPE, /* tp_flags */ 1530 0, /* tp_doc */ 1531 (traverseproc)mbstreamreader_traverse, /* tp_traverse */ 1532 0, /* tp_clear */ 1533 0, /* tp_richcompare */ 1534 0, /* tp_weaklistoffset */ 1535 0, /* tp_iter */ 1536 0, /* tp_iterext */ 1537 mbstreamreader_methods, /* tp_methods */ 1538 mbstreamreader_members, /* tp_members */ 1539 codecctx_getsets, /* tp_getset */ 1540 0, /* tp_base */ 1541 0, /* tp_dict */ 1542 0, /* tp_descr_get */ 1543 0, /* tp_descr_set */ 1544 0, /* tp_dictoffset */ 1545 mbstreamreader_init, /* tp_init */ 1546 0, /* tp_alloc */ 1547 mbstreamreader_new, /* tp_new */ 1548}; 1549 1550 1551/** 1552 * MultibyteStreamWriter object 1553 */ 1554 1555static int 1556mbstreamwriter_iwrite(MultibyteStreamWriterObject *self, 1557 PyObject *unistr) 1558{ 1559 PyObject *str, *wr; 1560 1561 str = encoder_encode_stateful(STATEFUL_ECTX(self), unistr, 0); 1562 if (str == NULL) 1563 return -1; 1564 1565 wr = PyObject_CallMethod(self->stream, "write", "O", str); 1566 Py_DECREF(str); 1567 if (wr == NULL) 1568 return -1; 1569 1570 Py_DECREF(wr); 1571 return 0; 1572} 1573 1574static PyObject * 1575mbstreamwriter_write(MultibyteStreamWriterObject *self, PyObject *strobj) 1576{ 1577 if (mbstreamwriter_iwrite(self, strobj)) 1578 return NULL; 1579 else 1580 Py_RETURN_NONE; 1581} 1582 1583static PyObject * 1584mbstreamwriter_writelines(MultibyteStreamWriterObject *self, PyObject *lines) 1585{ 1586 PyObject *strobj; 1587 int i, r; 1588 1589 if (!PySequence_Check(lines)) { 1590 PyErr_SetString(PyExc_TypeError, 1591 "arg must be a sequence object"); 1592 return NULL; 1593 } 1594 1595 for (i = 0; i < PySequence_Length(lines); i++) { 1596 /* length can be changed even within this loop */ 1597 strobj = PySequence_GetItem(lines, i); 1598 if (strobj == NULL) 1599 return NULL; 1600 1601 r = mbstreamwriter_iwrite(self, strobj); 1602 Py_DECREF(strobj); 1603 if (r == -1) 1604 return NULL; 1605 } 1606 1607 Py_RETURN_NONE; 1608} 1609 1610static PyObject * 1611mbstreamwriter_reset(MultibyteStreamWriterObject *self) 1612{ 1613 const Py_UNICODE *pending; 1614 PyObject *pwrt; 1615 1616 pending = self->pending; 1617 pwrt = multibytecodec_encode(self->codec, &self->state, 1618 &pending, self->pendingsize, self->errors, 1619 MBENC_FLUSH | MBENC_RESET); 1620 /* some pending buffer can be truncated when UnicodeEncodeError is 1621 * raised on 'strict' mode. but, 'reset' method is designed to 1622 * reset the pending buffer or states so failed string sequence 1623 * ought to be missed */ 1624 self->pendingsize = 0; 1625 if (pwrt == NULL) 1626 return NULL; 1627 1628 if (PyString_Size(pwrt) > 0) { 1629 PyObject *wr; 1630 wr = PyObject_CallMethod(self->stream, "write", "O", pwrt); 1631 if (wr == NULL) { 1632 Py_DECREF(pwrt); 1633 return NULL; 1634 } 1635 } 1636 Py_DECREF(pwrt); 1637 1638 Py_RETURN_NONE; 1639} 1640 1641static PyObject * 1642mbstreamwriter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 1643{ 1644 MultibyteStreamWriterObject *self; 1645 PyObject *stream, *codec = NULL; 1646 char *errors = NULL; 1647 1648 if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|s:StreamWriter", 1649 streamkwarglist, &stream, &errors)) 1650 return NULL; 1651 1652 self = (MultibyteStreamWriterObject *)type->tp_alloc(type, 0); 1653 if (self == NULL) 1654 return NULL; 1655 1656 codec = PyObject_GetAttrString((PyObject *)type, "codec"); 1657 if (codec == NULL) 1658 goto errorexit; 1659 if (!MultibyteCodec_Check(codec)) { 1660 PyErr_SetString(PyExc_TypeError, "codec is unexpected type"); 1661 goto errorexit; 1662 } 1663 1664 self->codec = ((MultibyteCodecObject *)codec)->codec; 1665 self->stream = stream; 1666 Py_INCREF(stream); 1667 self->pendingsize = 0; 1668 self->errors = internal_error_callback(errors); 1669 if (self->errors == NULL) 1670 goto errorexit; 1671 if (self->codec->encinit != NULL && 1672 self->codec->encinit(&self->state, self->codec->config) != 0) 1673 goto errorexit; 1674 1675 Py_DECREF(codec); 1676 return (PyObject *)self; 1677 1678errorexit: 1679 Py_XDECREF(self); 1680 Py_XDECREF(codec); 1681 return NULL; 1682} 1683 1684static int 1685mbstreamwriter_init(PyObject *self, PyObject *args, PyObject *kwds) 1686{ 1687 return 0; 1688} 1689 1690static int 1691mbstreamwriter_traverse(MultibyteStreamWriterObject *self, 1692 visitproc visit, void *arg) 1693{ 1694 if (ERROR_ISCUSTOM(self->errors)) 1695 Py_VISIT(self->errors); 1696 Py_VISIT(self->stream); 1697 return 0; 1698} 1699 1700static void 1701mbstreamwriter_dealloc(MultibyteStreamWriterObject *self) 1702{ 1703 PyObject_GC_UnTrack(self); 1704 ERROR_DECREF(self->errors); 1705 Py_XDECREF(self->stream); 1706 Py_TYPE(self)->tp_free(self); 1707} 1708 1709static struct PyMethodDef mbstreamwriter_methods[] = { 1710 {"write", (PyCFunction)mbstreamwriter_write, 1711 METH_O, NULL}, 1712 {"writelines", (PyCFunction)mbstreamwriter_writelines, 1713 METH_O, NULL}, 1714 {"reset", (PyCFunction)mbstreamwriter_reset, 1715 METH_NOARGS, NULL}, 1716 {NULL, NULL}, 1717}; 1718 1719static PyMemberDef mbstreamwriter_members[] = { 1720 {"stream", T_OBJECT, 1721 offsetof(MultibyteStreamWriterObject, stream), 1722 READONLY, NULL}, 1723 {NULL,} 1724}; 1725 1726static PyTypeObject MultibyteStreamWriter_Type = { 1727 PyVarObject_HEAD_INIT(NULL, 0) 1728 "MultibyteStreamWriter", /* tp_name */ 1729 sizeof(MultibyteStreamWriterObject), /* tp_basicsize */ 1730 0, /* tp_itemsize */ 1731 /* methods */ 1732 (destructor)mbstreamwriter_dealloc, /* tp_dealloc */ 1733 0, /* tp_print */ 1734 0, /* tp_getattr */ 1735 0, /* tp_setattr */ 1736 0, /* tp_compare */ 1737 0, /* tp_repr */ 1738 0, /* tp_as_number */ 1739 0, /* tp_as_sequence */ 1740 0, /* tp_as_mapping */ 1741 0, /* tp_hash */ 1742 0, /* tp_call */ 1743 0, /* tp_str */ 1744 PyObject_GenericGetAttr, /* tp_getattro */ 1745 0, /* tp_setattro */ 1746 0, /* tp_as_buffer */ 1747 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC 1748 | Py_TPFLAGS_BASETYPE, /* tp_flags */ 1749 0, /* tp_doc */ 1750 (traverseproc)mbstreamwriter_traverse, /* tp_traverse */ 1751 0, /* tp_clear */ 1752 0, /* tp_richcompare */ 1753 0, /* tp_weaklistoffset */ 1754 0, /* tp_iter */ 1755 0, /* tp_iterext */ 1756 mbstreamwriter_methods, /* tp_methods */ 1757 mbstreamwriter_members, /* tp_members */ 1758 codecctx_getsets, /* tp_getset */ 1759 0, /* tp_base */ 1760 0, /* tp_dict */ 1761 0, /* tp_descr_get */ 1762 0, /* tp_descr_set */ 1763 0, /* tp_dictoffset */ 1764 mbstreamwriter_init, /* tp_init */ 1765 0, /* tp_alloc */ 1766 mbstreamwriter_new, /* tp_new */ 1767}; 1768 1769 1770/** 1771 * Exposed factory function 1772 */ 1773 1774static PyObject * 1775__create_codec(PyObject *ignore, PyObject *arg) 1776{ 1777 MultibyteCodecObject *self; 1778 MultibyteCodec *codec; 1779 1780 if (!PyCapsule_IsValid(arg, PyMultibyteCodec_CAPSULE_NAME)) { 1781 PyErr_SetString(PyExc_ValueError, "argument type invalid"); 1782 return NULL; 1783 } 1784 1785 codec = PyCapsule_GetPointer(arg, PyMultibyteCodec_CAPSULE_NAME); 1786 if (codec->codecinit != NULL && codec->codecinit(codec->config) != 0) 1787 return NULL; 1788 1789 self = PyObject_New(MultibyteCodecObject, &MultibyteCodec_Type); 1790 if (self == NULL) 1791 return NULL; 1792 self->codec = codec; 1793 1794 return (PyObject *)self; 1795} 1796 1797static struct PyMethodDef __methods[] = { 1798 {"__create_codec", (PyCFunction)__create_codec, METH_O}, 1799 {NULL, NULL}, 1800}; 1801 1802PyMODINIT_FUNC 1803init_multibytecodec(void) 1804{ 1805 int i; 1806 PyObject *m; 1807 PyTypeObject *typelist[] = { 1808 &MultibyteIncrementalEncoder_Type, 1809 &MultibyteIncrementalDecoder_Type, 1810 &MultibyteStreamReader_Type, 1811 &MultibyteStreamWriter_Type, 1812 NULL 1813 }; 1814 1815 if (PyType_Ready(&MultibyteCodec_Type) < 0) 1816 return; 1817 1818 m = Py_InitModule("_multibytecodec", __methods); 1819 if (m == NULL) 1820 return; 1821 1822 for (i = 0; typelist[i] != NULL; i++) { 1823 if (PyType_Ready(typelist[i]) < 0) 1824 return; 1825 Py_INCREF(typelist[i]); 1826 PyModule_AddObject(m, typelist[i]->tp_name, 1827 (PyObject *)typelist[i]); 1828 } 1829 1830 if (PyErr_Occurred()) 1831 Py_FatalError("can't initialize the _multibytecodec module"); 1832} 1833