unicodeobject.h revision 004d64f362eb0bd0d3e2f257b2b7721fecba87af
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4/* 5 6Unicode implementation based on original code by Fredrik Lundh, 7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 8Unicode Integration Proposal (see file Misc/unicode.txt). 9 10(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 11 12 13 Original header: 14 -------------------------------------------------------------------- 15 16 * Yet another Unicode string type for Python. This type supports the 17 * 16-bit Basic Multilingual Plane (BMP) only. 18 * 19 * Written by Fredrik Lundh, January 1999. 20 * 21 * Copyright (c) 1999 by Secret Labs AB. 22 * Copyright (c) 1999 by Fredrik Lundh. 23 * 24 * fredrik@pythonware.com 25 * http://www.pythonware.com 26 * 27 * -------------------------------------------------------------------- 28 * This Unicode String Type is 29 * 30 * Copyright (c) 1999 by Secret Labs AB 31 * Copyright (c) 1999 by Fredrik Lundh 32 * 33 * By obtaining, using, and/or copying this software and/or its 34 * associated documentation, you agree that you have read, understood, 35 * and will comply with the following terms and conditions: 36 * 37 * Permission to use, copy, modify, and distribute this software and its 38 * associated documentation for any purpose and without fee is hereby 39 * granted, provided that the above copyright notice appears in all 40 * copies, and that both that copyright notice and this permission notice 41 * appear in supporting documentation, and that the name of Secret Labs 42 * AB or the author not be used in advertising or publicity pertaining to 43 * distribution of the software without specific, written prior 44 * permission. 45 * 46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 53 * -------------------------------------------------------------------- */ 54 55#include "ctype.h" 56 57/* === Internal API ======================================================= */ 58 59/* --- Internal Unicode Format -------------------------------------------- */ 60 61/* Set these flags if the platform has "wchar.h", "wctype.h" and the 62 wchar_t type is a 16-bit unsigned type */ 63/* #define HAVE_WCHAR_H */ 64/* #define HAVE_USABLE_WCHAR_T */ 65 66/* Defaults for various platforms */ 67#ifndef HAVE_USABLE_WCHAR_T 68 69/* Windows has a usable wchar_t type */ 70# if defined(MS_WIN32) 71# define HAVE_USABLE_WCHAR_T 72# endif 73 74#endif 75 76/* If the compiler provides a wchar_t type we try to support it 77 through the interface functions PyUnicode_FromWideChar() and 78 PyUnicode_AsWideChar(). */ 79 80#ifdef HAVE_USABLE_WCHAR_T 81# define HAVE_WCHAR_H 82#endif 83 84#ifdef HAVE_WCHAR_H 85/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 86# ifdef _HAVE_BSDI 87# include <time.h> 88# endif 89# include "wchar.h" 90#endif 91 92#ifdef HAVE_USABLE_WCHAR_T 93 94/* If the compiler defines whcar_t as a 16-bit unsigned type we can 95 use the compiler type directly. Works fine with all modern Windows 96 platforms. */ 97 98typedef wchar_t Py_UNICODE; 99 100#else 101 102/* Use if you have a standard ANSI compiler, without wchar_t support. 103 If a short is not 16 bits on your platform, you have to fix the 104 typedef below, or the module initialization code will complain. */ 105 106typedef unsigned short Py_UNICODE; 107 108#endif 109 110/* --- Internal Unicode Operations ---------------------------------------- */ 111 112/* If you want Python to use the compiler's wctype.h functions instead 113 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or 114 configure Python using --with-ctype-functions. This reduces the 115 interpreter's code size. */ 116 117#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) 118 119#include "wctype.h" 120 121#define Py_UNICODE_ISSPACE(ch) iswspace(ch) 122 123#define Py_UNICODE_ISLOWER(ch) iswlower(ch) 124#define Py_UNICODE_ISUPPER(ch) iswupper(ch) 125#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 126#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 127 128#define Py_UNICODE_TOLOWER(ch) towlower(ch) 129#define Py_UNICODE_TOUPPER(ch) towupper(ch) 130#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 131 132#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 133#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 134#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 135 136#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 137#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 138#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 139 140#else 141 142#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch) 143 144#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 145#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 146#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 147#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 148 149#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 150#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 151#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 152 153#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 154#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 155#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 156 157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 160 161#endif 162 163#define Py_UNICODE_COPY(target, source, length)\ 164 (memcpy((target), (source), (length)*sizeof(Py_UNICODE))) 165 166#define Py_UNICODE_FILL(target, value, length) do\ 167 {int i; for (i = 0; i < (length); i++) (target)[i] = (value);}\ 168 while (0) 169 170#define Py_UNICODE_MATCH(string, offset, substring)\ 171 (!memcmp((string)->str + (offset), (substring)->str,\ 172 (substring)->length*sizeof(Py_UNICODE))) 173 174#ifdef __cplusplus 175extern "C" { 176#endif 177 178/* --- Unicode Type ------------------------------------------------------- */ 179 180typedef struct { 181 PyObject_HEAD 182 int length; /* Length of raw Unicode data in buffer */ 183 Py_UNICODE *str; /* Raw Unicode buffer */ 184 long hash; /* Hash value; -1 if not set */ 185 PyObject *utf8str; /* UTF-8 encoded version as Python string, 186 or NULL */ 187} PyUnicodeObject; 188 189extern DL_IMPORT(PyTypeObject) PyUnicode_Type; 190 191#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type)) 192 193/* Fast access macros */ 194#define PyUnicode_GET_SIZE(op) \ 195 (((PyUnicodeObject *)(op))->length) 196#define PyUnicode_GET_DATA_SIZE(op) \ 197 (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)) 198#define PyUnicode_AS_UNICODE(op) \ 199 (((PyUnicodeObject *)(op))->str) 200#define PyUnicode_AS_DATA(op) \ 201 ((const char *)((PyUnicodeObject *)(op))->str) 202 203/* --- Constants ---------------------------------------------------------- */ 204 205/* This Unicode character will be used as replacement character during 206 decoding if the errors argument is set to "replace". Note: the 207 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 208 Unicode 3.0. */ 209 210#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 211 212/* === Public API ========================================================= */ 213 214/* --- Plain Py_UNICODE --------------------------------------------------- */ 215 216/* Create a Unicode Object from the Py_UNICODE buffer u of the given 217 size. u may be NULL which causes the contents to be undefined. It 218 is the user's responsibility to fill in the needed data. 219 220 The buffer is copied into the new object. */ 221 222extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode( 223 const Py_UNICODE *u, /* Unicode buffer */ 224 int size /* size of buffer */ 225 ); 226 227/* Return a read-only pointer to the Unicode object's internal 228 Py_UNICODE buffer. */ 229 230extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode( 231 PyObject *unicode /* Unicode object */ 232 ); 233 234/* Get the length of the Unicode object. */ 235 236extern DL_IMPORT(int) PyUnicode_GetSize( 237 PyObject *unicode /* Unicode object */ 238 ); 239 240/* Resize an already allocated Unicode object to the new size length. 241 242 *unicode is modified to point to the new (resized) object and 0 243 returned on success. 244 245 This API may only be called by the function which also called the 246 Unicode constructor. The refcount on the object must be 1. Otherwise, 247 an error is returned. 248 249 Error handling is implemented as follows: an exception is set, -1 250 is returned and *unicode left untouched. 251 252*/ 253 254extern DL_IMPORT(int) PyUnicode_Resize( 255 PyObject **unicode, /* Pointer to the Unicode object */ 256 int length /* New length */ 257 ); 258 259/* Coerce obj to an Unicode object and return a reference with 260 *incremented* refcount. 261 262 Coercion is done in the following way: 263 264 1. Unicode objects are passed back as-is with incremented 265 refcount. 266 267 2. String and other char buffer compatible objects are decoded 268 under the assumptions that they contain UTF-8 data. Decoding 269 is done in "strict" mode. 270 271 3. All other objects raise an exception. 272 273 The API returns NULL in case of an error. The caller is responsible 274 for decref'ing the returned objects. 275 276*/ 277 278extern DL_IMPORT(PyObject*) PyUnicode_FromObject( 279 register PyObject *obj /* Object */ 280 ); 281 282/* --- wchar_t support for platforms which support it --------------------- */ 283 284#ifdef HAVE_WCHAR_H 285 286/* Create a Unicode Object from the whcar_t buffer w of the given 287 size. 288 289 The buffer is copied into the new object. */ 290 291extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar( 292 register const wchar_t *w, /* wchar_t buffer */ 293 int size /* size of buffer */ 294 ); 295 296/* Copies the Unicode Object contents into the whcar_t buffer w. At 297 most size wchar_t characters are copied. 298 299 Returns the number of wchar_t characters copied or -1 in case of an 300 error. */ 301 302extern DL_IMPORT(int) PyUnicode_AsWideChar( 303 PyUnicodeObject *unicode, /* Unicode object */ 304 register wchar_t *w, /* wchar_t buffer */ 305 int size /* size of buffer */ 306 ); 307 308#endif 309 310/* === Builtin Codecs ===================================================== 311 312 Many of these APIs take two arguments encoding and errors. These 313 parameters encoding and errors have the same semantics as the ones 314 of the builtin unicode() API. 315 316 Setting encoding to NULL causes the default encoding to be used 317 which is UTF-8. 318 319 Error handling is set by errors which may also be set to NULL 320 meaning to use the default handling defined for the codec. Default 321 error handling for all builtin codecs is "strict" (ValueErrors are 322 raised). 323 324 The codecs all use a similar interface. Only deviation from the 325 generic ones are documented. 326 327*/ 328 329/* --- Generic Codecs ----------------------------------------------------- */ 330 331/* Create a Unicode object by decoding the encoded string s of the 332 given size. */ 333 334extern DL_IMPORT(PyObject*) PyUnicode_Decode( 335 const char *s, /* encoded string */ 336 int size, /* size of buffer */ 337 const char *encoding, /* encoding */ 338 const char *errors /* error handling */ 339 ); 340 341/* Encodes a Py_UNICODE buffer of the given size and returns a 342 Python string object. */ 343 344extern DL_IMPORT(PyObject*) PyUnicode_Encode( 345 const Py_UNICODE *s, /* Unicode char buffer */ 346 int size, /* number of Py_UNICODE chars to encode */ 347 const char *encoding, /* encoding */ 348 const char *errors /* error handling */ 349 ); 350 351/* Encodes a Unicode object and returns the result as Python string 352 object. */ 353 354extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString( 355 PyObject *unicode, /* Unicode object */ 356 const char *encoding, /* encoding */ 357 const char *errors /* error handling */ 358 ); 359 360/* --- UTF-8 Codecs ------------------------------------------------------- */ 361 362extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8( 363 const char *string, /* UTF-8 encoded string */ 364 int length, /* size of string */ 365 const char *errors /* error handling */ 366 ); 367 368extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String( 369 PyObject *unicode /* Unicode object */ 370 ); 371 372extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( 373 const Py_UNICODE *data, /* Unicode char buffer */ 374 int length, /* number of Py_UNICODE chars to encode */ 375 const char *errors /* error handling */ 376 ); 377 378/* --- UTF-16 Codecs ------------------------------------------------------ */ 379 380/* Decodes length bytes from a UTF-16 encoded buffer string and returns 381 the corresponding Unicode object. 382 383 errors (if non-NULL) defines the error handling. It defaults 384 to "strict". 385 386 If byteorder is non-NULL, the decoder starts decoding using the 387 given byte order: 388 389 *byteorder == -1: little endian 390 *byteorder == 0: native order 391 *byteorder == 1: big endian 392 393 and then switches according to all BOM marks it finds in the input 394 data. BOM marks are not copied into the resulting Unicode string. 395 After completion, *byteorder is set to the current byte order at 396 the end of input data. 397 398 If byteorder is NULL, the codec starts in native order mode. 399 400*/ 401 402extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16( 403 const char *string, /* UTF-16 encoded string */ 404 int length, /* size of string */ 405 const char *errors, /* error handling */ 406 int *byteorder /* pointer to byteorder to use 407 0=native;-1=LE,1=BE; updated on 408 exit */ 409 ); 410 411/* Returns a Python string using the UTF-16 encoding in native byte 412 order. The string always starts with a BOM mark. */ 413 414extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String( 415 PyObject *unicode /* Unicode object */ 416 ); 417 418/* Returns a Python string object holding the UTF-16 encoded value of 419 the Unicode data. 420 421 If byteorder is not 0, output is written according to the following 422 byte order: 423 424 byteorder == -1: little endian 425 byteorder == 0: native byte order (writes a BOM mark) 426 byteorder == 1: big endian 427 428 If byteorder is 0, the output string will always start with the 429 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 430 prepended. 431 432 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 433 UCS-2. This trick makes it possible to add full UTF-16 capabilities 434 at a later point without comprimising the APIs. 435 436*/ 437 438extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16( 439 const Py_UNICODE *data, /* Unicode char buffer */ 440 int length, /* number of Py_UNICODE chars to encode */ 441 const char *errors, /* error handling */ 442 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 443 ); 444 445/* --- Unicode-Escape Codecs ---------------------------------------------- */ 446 447extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape( 448 const char *string, /* Unicode-Escape encoded string */ 449 int length, /* size of string */ 450 const char *errors /* error handling */ 451 ); 452 453extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString( 454 PyObject *unicode /* Unicode object */ 455 ); 456 457extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape( 458 const Py_UNICODE *data, /* Unicode char buffer */ 459 int length /* Number of Py_UNICODE chars to encode */ 460 ); 461 462/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 463 464extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 465 const char *string, /* Raw-Unicode-Escape encoded string */ 466 int length, /* size of string */ 467 const char *errors /* error handling */ 468 ); 469 470extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 471 PyObject *unicode /* Unicode object */ 472 ); 473 474extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 475 const Py_UNICODE *data, /* Unicode char buffer */ 476 int length /* Number of Py_UNICODE chars to encode */ 477 ); 478 479/* --- Latin-1 Codecs ----------------------------------------------------- 480 481 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 482 483*/ 484 485extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1( 486 const char *string, /* Latin-1 encoded string */ 487 int length, /* size of string */ 488 const char *errors /* error handling */ 489 ); 490 491extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String( 492 PyObject *unicode /* Unicode object */ 493 ); 494 495extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1( 496 const Py_UNICODE *data, /* Unicode char buffer */ 497 int length, /* Number of Py_UNICODE chars to encode */ 498 const char *errors /* error handling */ 499 ); 500 501/* --- ASCII Codecs ------------------------------------------------------- 502 503 Only 7-bit ASCII data is excepted. All other codes generate errors. 504 505*/ 506 507extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII( 508 const char *string, /* ASCII encoded string */ 509 int length, /* size of string */ 510 const char *errors /* error handling */ 511 ); 512 513extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString( 514 PyObject *unicode /* Unicode object */ 515 ); 516 517extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII( 518 const Py_UNICODE *data, /* Unicode char buffer */ 519 int length, /* Number of Py_UNICODE chars to encode */ 520 const char *errors /* error handling */ 521 ); 522 523/* --- Character Map Codecs ----------------------------------------------- 524 525 This codec uses mappings to encode and decode characters. 526 527 Decoding mappings must map single string characters to single 528 Unicode characters, integers (which are then interpreted as Unicode 529 ordinals) or None (meaning "undefined mapping" and causing an 530 error). 531 532 Encoding mappings must map single Unicode characters to single 533 string characters, integers (which are then interpreted as Latin-1 534 ordinals) or None (meaning "undefined mapping" and causing an 535 error). 536 537 If a character lookup fails with a LookupError, the character is 538 copied as-is meaning that its ordinal value will be interpreted as 539 Unicode or Latin-1 ordinal resp. Because of this mappings only need 540 to contain those mappings which map characters to different code 541 points. 542 543*/ 544 545extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap( 546 const char *string, /* Encoded string */ 547 int length, /* size of string */ 548 PyObject *mapping, /* character mapping 549 (char ordinal -> unicode ordinal) */ 550 const char *errors /* error handling */ 551 ); 552 553extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString( 554 PyObject *unicode, /* Unicode object */ 555 PyObject *mapping /* character mapping 556 (unicode ordinal -> char ordinal) */ 557 ); 558 559extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap( 560 const Py_UNICODE *data, /* Unicode char buffer */ 561 int length, /* Number of Py_UNICODE chars to encode */ 562 PyObject *mapping, /* character mapping 563 (unicode ordinal -> char ordinal) */ 564 const char *errors /* error handling */ 565 ); 566 567/* Translate a Py_UNICODE buffer of the given length by applying a 568 character mapping table to it and return the resulting Unicode 569 object. 570 571 The mapping table must map Unicode ordinal integers to Unicode 572 ordinal integers or None (causing deletion of the character). 573 574 Mapping tables may be dictionaries or sequences. Unmapped character 575 ordinals (ones which cause a LookupError) are left untouched and 576 are copied as-is. 577 578*/ 579 580extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap( 581 const Py_UNICODE *data, /* Unicode char buffer */ 582 int length, /* Number of Py_UNICODE chars to encode */ 583 PyObject *table, /* Translate table */ 584 const char *errors /* error handling */ 585 ); 586 587#ifdef MS_WIN32 588 589/* --- MBCS codecs for Windows -------------------------------------------- */ 590 591extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS( 592 const char *string, /* MBCS encoded string */ 593 int length, /* size of string */ 594 const char *errors /* error handling */ 595 ); 596 597extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString( 598 PyObject *unicode /* Unicode object */ 599 ); 600 601extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS( 602 const Py_UNICODE *data, /* Unicode char buffer */ 603 int length, /* Number of Py_UNICODE chars to encode */ 604 const char *errors /* error handling */ 605 ); 606 607#endif /* MS_WIN32 */ 608 609/* --- Decimal Encoder ---------------------------------------------------- */ 610 611/* Takes a Unicode string holding a decimal value and writes it into 612 an output buffer using standard ASCII digit codes. 613 614 The output buffer has to provide at least length+1 bytes of storage 615 area. The output string is 0-terminated. 616 617 The encoder converts whitespace to ' ', decimal characters to their 618 corresponding ASCII digit and all other Latin-1 characters except 619 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 620 are treated as errors. This includes embedded NULL bytes. 621 622 Error handling is defined by the errors argument: 623 624 NULL or "strict": raise a ValueError 625 "ignore": ignore the wrong characters (these are not copied to the 626 output buffer) 627 "replace": replaces illegal characters with '?' 628 629 Returns 0 on success, -1 on failure. 630 631*/ 632 633extern DL_IMPORT(int) PyUnicode_EncodeDecimal( 634 Py_UNICODE *s, /* Unicode buffer */ 635 int length, /* Number of Py_UNICODE chars to encode */ 636 char *output, /* Output buffer; must have size >= length */ 637 const char *errors /* error handling */ 638 ); 639 640/* --- Methods & Slots ---------------------------------------------------- 641 642 These are capable of handling Unicode objects and strings on input 643 (we refer to them as strings in the descriptions) and return 644 Unicode objects or integers as apporpriate. */ 645 646/* Concat two strings giving a new Unicode string. */ 647 648extern DL_IMPORT(PyObject*) PyUnicode_Concat( 649 PyObject *left, /* Left string */ 650 PyObject *right /* Right string */ 651 ); 652 653/* Split a string giving a list of Unicode strings. 654 655 If sep is NULL, splitting will be done at all whitespace 656 substrings. Otherwise, splits occur at the given separator. 657 658 At most maxsplit splits will be done. If negative, no limit is set. 659 660 Separators are not included in the resulting list. 661 662*/ 663 664extern DL_IMPORT(PyObject*) PyUnicode_Split( 665 PyObject *s, /* String to split */ 666 PyObject *sep, /* String separator */ 667 int maxsplit /* Maxsplit count */ 668 ); 669 670/* Dito, but split at line breaks. 671 672 CRLF is considered to be one line break. Line breaks are not 673 included in the resulting list. */ 674 675extern DL_IMPORT(PyObject*) PyUnicode_Splitlines( 676 PyObject *s, /* String to split */ 677 int keepends /* If true, line end markers are included */ 678 ); 679 680/* Translate a string by applying a character mapping table to it and 681 return the resulting Unicode object. 682 683 The mapping table must map Unicode ordinal integers to Unicode 684 ordinal integers or None (causing deletion of the character). 685 686 Mapping tables may be dictionaries or sequences. Unmapped character 687 ordinals (ones which cause a LookupError) are left untouched and 688 are copied as-is. 689 690*/ 691 692extern DL_IMPORT(PyObject *) PyUnicode_Translate( 693 PyObject *str, /* String */ 694 PyObject *table, /* Translate table */ 695 const char *errors /* error handling */ 696 ); 697 698/* Join a sequence of strings using the given separator and return 699 the resulting Unicode string. */ 700 701extern DL_IMPORT(PyObject*) PyUnicode_Join( 702 PyObject *separator, /* Separator string */ 703 PyObject *seq /* Sequence object */ 704 ); 705 706/* Return 1 if substr matches str[start:end] at the given tail end, 0 707 otherwise. */ 708 709extern DL_IMPORT(int) PyUnicode_Tailmatch( 710 PyObject *str, /* String */ 711 PyObject *substr, /* Prefix or Suffix string */ 712 int start, /* Start index */ 713 int end, /* Stop index */ 714 int direction /* Tail end: -1 prefix, +1 suffix */ 715 ); 716 717/* Return the first position of substr in str[start:end] using the 718 given search direction or -1 if not found. */ 719 720extern DL_IMPORT(int) PyUnicode_Find( 721 PyObject *str, /* String */ 722 PyObject *substr, /* Substring to find */ 723 int start, /* Start index */ 724 int end, /* Stop index */ 725 int direction /* Find direction: +1 forward, -1 backward */ 726 ); 727 728/* Count the number of occurrences of substr in str[start:end]. */ 729 730extern DL_IMPORT(int) PyUnicode_Count( 731 PyObject *str, /* String */ 732 PyObject *substr, /* Substring to count */ 733 int start, /* Start index */ 734 int end /* Stop index */ 735 ); 736 737/* Replace at most maxcount occurrences of substr in str with replstr 738 and return the resulting Unicode object. */ 739 740extern DL_IMPORT(PyObject *) PyUnicode_Replace( 741 PyObject *str, /* String */ 742 PyObject *substr, /* Substring to find */ 743 PyObject *replstr, /* Substring to replace */ 744 int maxcount /* Max. number of replacements to apply; 745 -1 = all */ 746 ); 747 748/* Compare two strings and return -1, 0, 1 for less than, equal, 749 greater than resp. */ 750 751extern DL_IMPORT(int) PyUnicode_Compare( 752 PyObject *left, /* Left string */ 753 PyObject *right /* Right string */ 754 ); 755 756/* Apply a argument tuple or dictionar to a format string and return 757 the resulting Unicode string. */ 758 759extern DL_IMPORT(PyObject *) PyUnicode_Format( 760 PyObject *format, /* Format string */ 761 PyObject *args /* Argument tuple or dictionary */ 762 ); 763 764/* Checks whether element is contained in container and return 1/0 765 accordingly. 766 767 element has to coerce to an one element Unicode string. -1 is 768 returned in case of an error. */ 769 770extern DL_IMPORT(int) PyUnicode_Contains( 771 PyObject *container, /* Container string */ 772 PyObject *element /* Element string */ 773 ); 774 775/* === Characters Type APIs =============================================== */ 776 777/* These should not be used directly. Use the Py_UNICODE_IS* and 778 Py_UNICODE_TO* macros instead. 779 780 These APIs are implemented in Objects/unicodectype.c. 781 782*/ 783 784extern DL_IMPORT(int) _PyUnicode_IsLowercase( 785 register const Py_UNICODE ch /* Unicode character */ 786 ); 787 788extern DL_IMPORT(int) _PyUnicode_IsUppercase( 789 register const Py_UNICODE ch /* Unicode character */ 790 ); 791 792extern DL_IMPORT(int) _PyUnicode_IsTitlecase( 793 register const Py_UNICODE ch /* Unicode character */ 794 ); 795 796extern DL_IMPORT(int) _PyUnicode_IsWhitespace( 797 register const Py_UNICODE ch /* Unicode character */ 798 ); 799 800extern DL_IMPORT(int) _PyUnicode_IsLinebreak( 801 register const Py_UNICODE ch /* Unicode character */ 802 ); 803 804extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase( 805 register const Py_UNICODE ch /* Unicode character */ 806 ); 807 808extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase( 809 register const Py_UNICODE ch /* Unicode character */ 810 ); 811 812extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase( 813 register const Py_UNICODE ch /* Unicode character */ 814 ); 815 816extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit( 817 register const Py_UNICODE ch /* Unicode character */ 818 ); 819 820extern DL_IMPORT(int) _PyUnicode_ToDigit( 821 register const Py_UNICODE ch /* Unicode character */ 822 ); 823 824extern DL_IMPORT(double) _PyUnicode_ToNumeric( 825 register const Py_UNICODE ch /* Unicode character */ 826 ); 827 828extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit( 829 register const Py_UNICODE ch /* Unicode character */ 830 ); 831 832extern DL_IMPORT(int) _PyUnicode_IsDigit( 833 register const Py_UNICODE ch /* Unicode character */ 834 ); 835 836extern DL_IMPORT(int) _PyUnicode_IsNumeric( 837 register const Py_UNICODE ch /* Unicode character */ 838 ); 839 840#ifdef __cplusplus 841} 842#endif 843#endif /* !Py_UNICODEOBJECT_H */ 844