/* unicodeobject.h revision f03e74126e5702edab33148140e84d21471424ce */
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4/* 5 6Unicode implementation based on original code by Fredrik Lundh, 7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 8Unicode Integration Proposal (see file Misc/unicode.txt). 9 10(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 11 12 13 Original header: 14 -------------------------------------------------------------------- 15 16 * Yet another Unicode string type for Python. This type supports the 17 * 16-bit Basic Multilingual Plane (BMP) only. 18 * 19 * Written by Fredrik Lundh, January 1999. 20 * 21 * Copyright (c) 1999 by Secret Labs AB. 22 * Copyright (c) 1999 by Fredrik Lundh. 23 * 24 * fredrik@pythonware.com 25 * http://www.pythonware.com 26 * 27 * -------------------------------------------------------------------- 28 * This Unicode String Type is 29 * 30 * Copyright (c) 1999 by Secret Labs AB 31 * Copyright (c) 1999 by Fredrik Lundh 32 * 33 * By obtaining, using, and/or copying this software and/or its 34 * associated documentation, you agree that you have read, understood, 35 * and will comply with the following terms and conditions: 36 * 37 * Permission to use, copy, modify, and distribute this software and its 38 * associated documentation for any purpose and without fee is hereby 39 * granted, provided that the above copyright notice appears in all 40 * copies, and that both that copyright notice and this permission notice 41 * appear in supporting documentation, and that the name of Secret Labs 42 * AB or the author not be used in advertising or publicity pertaining to 43 * distribution of the software without specific, written prior 44 * permission. 45 * 46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 48 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 53 * -------------------------------------------------------------------- */ 54 55#include "ctype.h" 56 57/* === Internal API ======================================================= */ 58 59/* --- Internal Unicode Format -------------------------------------------- */ 60 61/* Set these flags if the platform has "wchar.h", "wctype.h" and the 62 wchar_t type is a 16-bit unsigned type */ 63/* #define HAVE_WCHAR_H */ 64/* #define HAVE_USABLE_WCHAR_T */ 65 66/* Defaults for various platforms */ 67#ifndef HAVE_USABLE_WCHAR_T 68 69/* Windows has a usable wchar_t type */ 70# if defined(MS_WIN32) 71# define HAVE_USABLE_WCHAR_T 72# endif 73 74#endif 75 76/* If the compiler provides a wchar_t type we try to support it 77 through the interface functions PyUnicode_FromWideChar() and 78 PyUnicode_AsWideChar(). */ 79 80#ifdef HAVE_USABLE_WCHAR_T 81# define HAVE_WCHAR_H 82#endif 83 84#ifdef HAVE_WCHAR_H 85/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 86# ifdef _HAVE_BSDI 87# include <time.h> 88# endif 89# include "wchar.h" 90#endif 91 92#ifdef HAVE_USABLE_WCHAR_T 93 94/* If the compiler defines whcar_t as a 16-bit unsigned type we can 95 use the compiler type directly. Works fine with all modern Windows 96 platforms. */ 97 98typedef wchar_t Py_UNICODE; 99 100#else 101 102/* Use if you have a standard ANSI compiler, without wchar_t support. 103 If a short is not 16 bits on your platform, you have to fix the 104 typedef below, or the module initialization code will complain. 
*/ 105 106typedef unsigned short Py_UNICODE; 107 108#endif 109 110/* --- Internal Unicode Operations ---------------------------------------- */ 111 112/* If you want Python to use the compiler's wctype.h functions instead 113 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or 114 configure Python using --with-ctype-functions. This reduces the 115 interpreter's code size. */ 116 117#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) 118 119#include "wctype.h" 120 121#define Py_UNICODE_ISSPACE(ch) iswspace(ch) 122 123#define Py_UNICODE_ISLOWER(ch) iswlower(ch) 124#define Py_UNICODE_ISUPPER(ch) iswupper(ch) 125#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 126#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 127 128#define Py_UNICODE_TOLOWER(ch) towlower(ch) 129#define Py_UNICODE_TOUPPER(ch) towupper(ch) 130#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 131 132#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 133#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 134#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 135 136#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 137#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 138#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 139 140#define Py_UNICODE_ISALPHA(ch) iswalpha(ch) 141 142#else 143 144#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch) 145 146#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 147#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 148#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 149#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 150 151#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 152#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 153#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 154 155#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 156#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 
157#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 158 159#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 160#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 161#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 162 163#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 164 165#endif 166 167#define Py_UNICODE_ISALNUM(ch) \ 168 (Py_UNICODE_ISALPHA(ch) || \ 169 Py_UNICODE_ISDECIMAL(ch) || \ 170 Py_UNICODE_ISDIGIT(ch) || \ 171 Py_UNICODE_ISNUMERIC(ch)) 172 173#define Py_UNICODE_COPY(target, source, length)\ 174 (memcpy((target), (source), (length)*sizeof(Py_UNICODE))) 175 176#define Py_UNICODE_FILL(target, value, length) do\ 177 {int i; for (i = 0; i < (length); i++) (target)[i] = (value);}\ 178 while (0) 179 180#define Py_UNICODE_MATCH(string, offset, substring)\ 181 ((*((string)->str + (offset)) == *((substring)->str)) &&\ 182 !memcmp((string)->str + (offset), (substring)->str,\ 183 (substring)->length*sizeof(Py_UNICODE))) 184 185#ifdef __cplusplus 186extern "C" { 187#endif 188 189/* --- Unicode Type ------------------------------------------------------- */ 190 191typedef struct { 192 PyObject_HEAD 193 int length; /* Length of raw Unicode data in buffer */ 194 Py_UNICODE *str; /* Raw Unicode buffer */ 195 long hash; /* Hash value; -1 if not set */ 196 PyObject *utf8str; /* UTF-8 encoded version as Python string, 197 or NULL */ 198} PyUnicodeObject; 199 200extern DL_IMPORT(PyTypeObject) PyUnicode_Type; 201 202#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type)) 203 204/* Fast access macros */ 205#define PyUnicode_GET_SIZE(op) \ 206 (((PyUnicodeObject *)(op))->length) 207#define PyUnicode_GET_DATA_SIZE(op) \ 208 (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)) 209#define PyUnicode_AS_UNICODE(op) \ 210 (((PyUnicodeObject *)(op))->str) 211#define PyUnicode_AS_DATA(op) \ 212 ((const char *)((PyUnicodeObject *)(op))->str) 213 214/* --- Constants ---------------------------------------------------------- */ 215 
216/* This Unicode character will be used as replacement character during 217 decoding if the errors argument is set to "replace". Note: the 218 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 219 Unicode 3.0. */ 220 221#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 222 223/* === Public API ========================================================= */ 224 225/* --- Plain Py_UNICODE --------------------------------------------------- */ 226 227/* Create a Unicode Object from the Py_UNICODE buffer u of the given 228 size. u may be NULL which causes the contents to be undefined. It 229 is the user's responsibility to fill in the needed data. 230 231 The buffer is copied into the new object. */ 232 233extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode( 234 const Py_UNICODE *u, /* Unicode buffer */ 235 int size /* size of buffer */ 236 ); 237 238/* Return a read-only pointer to the Unicode object's internal 239 Py_UNICODE buffer. */ 240 241extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode( 242 PyObject *unicode /* Unicode object */ 243 ); 244 245/* Get the length of the Unicode object. */ 246 247extern DL_IMPORT(int) PyUnicode_GetSize( 248 PyObject *unicode /* Unicode object */ 249 ); 250 251/* Resize an already allocated Unicode object to the new size length. 252 253 *unicode is modified to point to the new (resized) object and 0 254 returned on success. 255 256 This API may only be called by the function which also called the 257 Unicode constructor. The refcount on the object must be 1. Otherwise, 258 an error is returned. 259 260 Error handling is implemented as follows: an exception is set, -1 261 is returned and *unicode left untouched. 262 263*/ 264 265extern DL_IMPORT(int) PyUnicode_Resize( 266 PyObject **unicode, /* Pointer to the Unicode object */ 267 int length /* New length */ 268 ); 269 270/* Coerce obj to an Unicode object and return a reference with 271 *incremented* refcount. 
272 273 Coercion is done in the following way: 274 275 1. Unicode objects are passed back as-is with incremented 276 refcount. 277 278 2. String and other char buffer compatible objects are decoded 279 under the assumptions that they contain data using the current 280 default encoding. Decoding is done in "strict" mode. 281 282 3. All other objects raise an exception. 283 284 The API returns NULL in case of an error. The caller is responsible 285 for decref'ing the returned objects. 286 287*/ 288 289extern DL_IMPORT(PyObject*) PyUnicode_FromObject( 290 register PyObject *obj /* Object */ 291 ); 292 293/* --- wchar_t support for platforms which support it --------------------- */ 294 295#ifdef HAVE_WCHAR_H 296 297/* Create a Unicode Object from the whcar_t buffer w of the given 298 size. 299 300 The buffer is copied into the new object. */ 301 302extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar( 303 register const wchar_t *w, /* wchar_t buffer */ 304 int size /* size of buffer */ 305 ); 306 307/* Copies the Unicode Object contents into the whcar_t buffer w. At 308 most size wchar_t characters are copied. 309 310 Returns the number of wchar_t characters copied or -1 in case of an 311 error. */ 312 313extern DL_IMPORT(int) PyUnicode_AsWideChar( 314 PyUnicodeObject *unicode, /* Unicode object */ 315 register wchar_t *w, /* wchar_t buffer */ 316 int size /* size of buffer */ 317 ); 318 319#endif 320 321/* === Builtin Codecs ===================================================== 322 323 Many of these APIs take two arguments encoding and errors. These 324 parameters encoding and errors have the same semantics as the ones 325 of the builtin unicode() API. 326 327 Setting encoding to NULL causes the default encoding to be used. 328 329 Error handling is set by errors which may also be set to NULL 330 meaning to use the default handling defined for the codec. Default 331 error handling for all builtin codecs is "strict" (ValueErrors are 332 raised). 
333 334 The codecs all use a similar interface. Only deviation from the 335 generic ones are documented. 336 337*/ 338 339/* --- Manage the default encoding ---------------------------------------- */ 340 341/* Returns the currently active default encoding. 342 343 The default encoding is currently implemented as run-time settable 344 process global. This may change in future versions of the 345 interpreter to become a parameter which is managed on a per-thread 346 basis. 347 348 */ 349 350extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(); 351 352/* Sets the currently active default encoding. 353 354 Returns 0 on success, -1 in case of an error. 355 356 */ 357 358extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding( 359 const char *encoding /* Encoding name in standard form */ 360 ); 361 362/* --- Generic Codecs ----------------------------------------------------- */ 363 364/* Create a Unicode object by decoding the encoded string s of the 365 given size. */ 366 367extern DL_IMPORT(PyObject*) PyUnicode_Decode( 368 const char *s, /* encoded string */ 369 int size, /* size of buffer */ 370 const char *encoding, /* encoding */ 371 const char *errors /* error handling */ 372 ); 373 374/* Encodes a Py_UNICODE buffer of the given size and returns a 375 Python string object. */ 376 377extern DL_IMPORT(PyObject*) PyUnicode_Encode( 378 const Py_UNICODE *s, /* Unicode char buffer */ 379 int size, /* number of Py_UNICODE chars to encode */ 380 const char *encoding, /* encoding */ 381 const char *errors /* error handling */ 382 ); 383 384/* Encodes a Unicode object and returns the result as Python string 385 object. 
*/ 386 387extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString( 388 PyObject *unicode, /* Unicode object */ 389 const char *encoding, /* encoding */ 390 const char *errors /* error handling */ 391 ); 392 393/* --- UTF-8 Codecs ------------------------------------------------------- */ 394 395extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8( 396 const char *string, /* UTF-8 encoded string */ 397 int length, /* size of string */ 398 const char *errors /* error handling */ 399 ); 400 401extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String( 402 PyObject *unicode /* Unicode object */ 403 ); 404 405extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( 406 const Py_UNICODE *data, /* Unicode char buffer */ 407 int length, /* number of Py_UNICODE chars to encode */ 408 const char *errors /* error handling */ 409 ); 410 411/* --- UTF-16 Codecs ------------------------------------------------------ */ 412 413/* Decodes length bytes from a UTF-16 encoded buffer string and returns 414 the corresponding Unicode object. 415 416 errors (if non-NULL) defines the error handling. It defaults 417 to "strict". 418 419 If byteorder is non-NULL, the decoder starts decoding using the 420 given byte order: 421 422 *byteorder == -1: little endian 423 *byteorder == 0: native order 424 *byteorder == 1: big endian 425 426 and then switches according to all BOM marks it finds in the input 427 data. BOM marks are not copied into the resulting Unicode string. 428 After completion, *byteorder is set to the current byte order at 429 the end of input data. 430 431 If byteorder is NULL, the codec starts in native order mode. 
432 433*/ 434 435extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16( 436 const char *string, /* UTF-16 encoded string */ 437 int length, /* size of string */ 438 const char *errors, /* error handling */ 439 int *byteorder /* pointer to byteorder to use 440 0=native;-1=LE,1=BE; updated on 441 exit */ 442 ); 443 444/* Returns a Python string using the UTF-16 encoding in native byte 445 order. The string always starts with a BOM mark. */ 446 447extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String( 448 PyObject *unicode /* Unicode object */ 449 ); 450 451/* Returns a Python string object holding the UTF-16 encoded value of 452 the Unicode data. 453 454 If byteorder is not 0, output is written according to the following 455 byte order: 456 457 byteorder == -1: little endian 458 byteorder == 0: native byte order (writes a BOM mark) 459 byteorder == 1: big endian 460 461 If byteorder is 0, the output string will always start with the 462 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 463 prepended. 464 465 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 466 UCS-2. This trick makes it possible to add full UTF-16 capabilities 467 at a later point without comprimising the APIs. 
468 469*/ 470 471extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16( 472 const Py_UNICODE *data, /* Unicode char buffer */ 473 int length, /* number of Py_UNICODE chars to encode */ 474 const char *errors, /* error handling */ 475 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 476 ); 477 478/* --- Unicode-Escape Codecs ---------------------------------------------- */ 479 480extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape( 481 const char *string, /* Unicode-Escape encoded string */ 482 int length, /* size of string */ 483 const char *errors /* error handling */ 484 ); 485 486extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString( 487 PyObject *unicode /* Unicode object */ 488 ); 489 490extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape( 491 const Py_UNICODE *data, /* Unicode char buffer */ 492 int length /* Number of Py_UNICODE chars to encode */ 493 ); 494 495/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 496 497extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 498 const char *string, /* Raw-Unicode-Escape encoded string */ 499 int length, /* size of string */ 500 const char *errors /* error handling */ 501 ); 502 503extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 504 PyObject *unicode /* Unicode object */ 505 ); 506 507extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 508 const Py_UNICODE *data, /* Unicode char buffer */ 509 int length /* Number of Py_UNICODE chars to encode */ 510 ); 511 512/* --- Latin-1 Codecs ----------------------------------------------------- 513 514 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 
515 516*/ 517 518extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1( 519 const char *string, /* Latin-1 encoded string */ 520 int length, /* size of string */ 521 const char *errors /* error handling */ 522 ); 523 524extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String( 525 PyObject *unicode /* Unicode object */ 526 ); 527 528extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1( 529 const Py_UNICODE *data, /* Unicode char buffer */ 530 int length, /* Number of Py_UNICODE chars to encode */ 531 const char *errors /* error handling */ 532 ); 533 534/* --- ASCII Codecs ------------------------------------------------------- 535 536 Only 7-bit ASCII data is excepted. All other codes generate errors. 537 538*/ 539 540extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII( 541 const char *string, /* ASCII encoded string */ 542 int length, /* size of string */ 543 const char *errors /* error handling */ 544 ); 545 546extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString( 547 PyObject *unicode /* Unicode object */ 548 ); 549 550extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII( 551 const Py_UNICODE *data, /* Unicode char buffer */ 552 int length, /* Number of Py_UNICODE chars to encode */ 553 const char *errors /* error handling */ 554 ); 555 556/* --- Character Map Codecs ----------------------------------------------- 557 558 This codec uses mappings to encode and decode characters. 559 560 Decoding mappings must map single string characters to single 561 Unicode characters, integers (which are then interpreted as Unicode 562 ordinals) or None (meaning "undefined mapping" and causing an 563 error). 564 565 Encoding mappings must map single Unicode characters to single 566 string characters, integers (which are then interpreted as Latin-1 567 ordinals) or None (meaning "undefined mapping" and causing an 568 error). 
569 570 If a character lookup fails with a LookupError, the character is 571 copied as-is meaning that its ordinal value will be interpreted as 572 Unicode or Latin-1 ordinal resp. Because of this mappings only need 573 to contain those mappings which map characters to different code 574 points. 575 576*/ 577 578extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap( 579 const char *string, /* Encoded string */ 580 int length, /* size of string */ 581 PyObject *mapping, /* character mapping 582 (char ordinal -> unicode ordinal) */ 583 const char *errors /* error handling */ 584 ); 585 586extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString( 587 PyObject *unicode, /* Unicode object */ 588 PyObject *mapping /* character mapping 589 (unicode ordinal -> char ordinal) */ 590 ); 591 592extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap( 593 const Py_UNICODE *data, /* Unicode char buffer */ 594 int length, /* Number of Py_UNICODE chars to encode */ 595 PyObject *mapping, /* character mapping 596 (unicode ordinal -> char ordinal) */ 597 const char *errors /* error handling */ 598 ); 599 600/* Translate a Py_UNICODE buffer of the given length by applying a 601 character mapping table to it and return the resulting Unicode 602 object. 603 604 The mapping table must map Unicode ordinal integers to Unicode 605 ordinal integers or None (causing deletion of the character). 606 607 Mapping tables may be dictionaries or sequences. Unmapped character 608 ordinals (ones which cause a LookupError) are left untouched and 609 are copied as-is. 
610 611*/ 612 613extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap( 614 const Py_UNICODE *data, /* Unicode char buffer */ 615 int length, /* Number of Py_UNICODE chars to encode */ 616 PyObject *table, /* Translate table */ 617 const char *errors /* error handling */ 618 ); 619 620#ifdef MS_WIN32 621 622/* --- MBCS codecs for Windows -------------------------------------------- */ 623 624extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS( 625 const char *string, /* MBCS encoded string */ 626 int length, /* size of string */ 627 const char *errors /* error handling */ 628 ); 629 630extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString( 631 PyObject *unicode /* Unicode object */ 632 ); 633 634extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS( 635 const Py_UNICODE *data, /* Unicode char buffer */ 636 int length, /* Number of Py_UNICODE chars to encode */ 637 const char *errors /* error handling */ 638 ); 639 640#endif /* MS_WIN32 */ 641 642/* --- Decimal Encoder ---------------------------------------------------- */ 643 644/* Takes a Unicode string holding a decimal value and writes it into 645 an output buffer using standard ASCII digit codes. 646 647 The output buffer has to provide at least length+1 bytes of storage 648 area. The output string is 0-terminated. 649 650 The encoder converts whitespace to ' ', decimal characters to their 651 corresponding ASCII digit and all other Latin-1 characters except 652 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 653 are treated as errors. This includes embedded NULL bytes. 654 655 Error handling is defined by the errors argument: 656 657 NULL or "strict": raise a ValueError 658 "ignore": ignore the wrong characters (these are not copied to the 659 output buffer) 660 "replace": replaces illegal characters with '?' 661 662 Returns 0 on success, -1 on failure. 
663 664*/ 665 666extern DL_IMPORT(int) PyUnicode_EncodeDecimal( 667 Py_UNICODE *s, /* Unicode buffer */ 668 int length, /* Number of Py_UNICODE chars to encode */ 669 char *output, /* Output buffer; must have size >= length */ 670 const char *errors /* error handling */ 671 ); 672 673/* --- Methods & Slots ---------------------------------------------------- 674 675 These are capable of handling Unicode objects and strings on input 676 (we refer to them as strings in the descriptions) and return 677 Unicode objects or integers as apporpriate. */ 678 679/* Concat two strings giving a new Unicode string. */ 680 681extern DL_IMPORT(PyObject*) PyUnicode_Concat( 682 PyObject *left, /* Left string */ 683 PyObject *right /* Right string */ 684 ); 685 686/* Split a string giving a list of Unicode strings. 687 688 If sep is NULL, splitting will be done at all whitespace 689 substrings. Otherwise, splits occur at the given separator. 690 691 At most maxsplit splits will be done. If negative, no limit is set. 692 693 Separators are not included in the resulting list. 694 695*/ 696 697extern DL_IMPORT(PyObject*) PyUnicode_Split( 698 PyObject *s, /* String to split */ 699 PyObject *sep, /* String separator */ 700 int maxsplit /* Maxsplit count */ 701 ); 702 703/* Dito, but split at line breaks. 704 705 CRLF is considered to be one line break. Line breaks are not 706 included in the resulting list. */ 707 708extern DL_IMPORT(PyObject*) PyUnicode_Splitlines( 709 PyObject *s, /* String to split */ 710 int keepends /* If true, line end markers are included */ 711 ); 712 713/* Translate a string by applying a character mapping table to it and 714 return the resulting Unicode object. 715 716 The mapping table must map Unicode ordinal integers to Unicode 717 ordinal integers or None (causing deletion of the character). 718 719 Mapping tables may be dictionaries or sequences. Unmapped character 720 ordinals (ones which cause a LookupError) are left untouched and 721 are copied as-is. 
722 723*/ 724 725extern DL_IMPORT(PyObject *) PyUnicode_Translate( 726 PyObject *str, /* String */ 727 PyObject *table, /* Translate table */ 728 const char *errors /* error handling */ 729 ); 730 731/* Join a sequence of strings using the given separator and return 732 the resulting Unicode string. */ 733 734extern DL_IMPORT(PyObject*) PyUnicode_Join( 735 PyObject *separator, /* Separator string */ 736 PyObject *seq /* Sequence object */ 737 ); 738 739/* Return 1 if substr matches str[start:end] at the given tail end, 0 740 otherwise. */ 741 742extern DL_IMPORT(int) PyUnicode_Tailmatch( 743 PyObject *str, /* String */ 744 PyObject *substr, /* Prefix or Suffix string */ 745 int start, /* Start index */ 746 int end, /* Stop index */ 747 int direction /* Tail end: -1 prefix, +1 suffix */ 748 ); 749 750/* Return the first position of substr in str[start:end] using the 751 given search direction or -1 if not found. */ 752 753extern DL_IMPORT(int) PyUnicode_Find( 754 PyObject *str, /* String */ 755 PyObject *substr, /* Substring to find */ 756 int start, /* Start index */ 757 int end, /* Stop index */ 758 int direction /* Find direction: +1 forward, -1 backward */ 759 ); 760 761/* Count the number of occurrences of substr in str[start:end]. */ 762 763extern DL_IMPORT(int) PyUnicode_Count( 764 PyObject *str, /* String */ 765 PyObject *substr, /* Substring to count */ 766 int start, /* Start index */ 767 int end /* Stop index */ 768 ); 769 770/* Replace at most maxcount occurrences of substr in str with replstr 771 and return the resulting Unicode object. */ 772 773extern DL_IMPORT(PyObject *) PyUnicode_Replace( 774 PyObject *str, /* String */ 775 PyObject *substr, /* Substring to find */ 776 PyObject *replstr, /* Substring to replace */ 777 int maxcount /* Max. number of replacements to apply; 778 -1 = all */ 779 ); 780 781/* Compare two strings and return -1, 0, 1 for less than, equal, 782 greater than resp. 
*/ 783 784extern DL_IMPORT(int) PyUnicode_Compare( 785 PyObject *left, /* Left string */ 786 PyObject *right /* Right string */ 787 ); 788 789/* Apply a argument tuple or dictionar to a format string and return 790 the resulting Unicode string. */ 791 792extern DL_IMPORT(PyObject *) PyUnicode_Format( 793 PyObject *format, /* Format string */ 794 PyObject *args /* Argument tuple or dictionary */ 795 ); 796 797/* Checks whether element is contained in container and return 1/0 798 accordingly. 799 800 element has to coerce to an one element Unicode string. -1 is 801 returned in case of an error. */ 802 803extern DL_IMPORT(int) PyUnicode_Contains( 804 PyObject *container, /* Container string */ 805 PyObject *element /* Element string */ 806 ); 807 808/* === Characters Type APIs =============================================== */ 809 810/* These should not be used directly. Use the Py_UNICODE_IS* and 811 Py_UNICODE_TO* macros instead. 812 813 These APIs are implemented in Objects/unicodectype.c. 
814 815*/ 816 817extern DL_IMPORT(int) _PyUnicode_IsLowercase( 818 register const Py_UNICODE ch /* Unicode character */ 819 ); 820 821extern DL_IMPORT(int) _PyUnicode_IsUppercase( 822 register const Py_UNICODE ch /* Unicode character */ 823 ); 824 825extern DL_IMPORT(int) _PyUnicode_IsTitlecase( 826 register const Py_UNICODE ch /* Unicode character */ 827 ); 828 829extern DL_IMPORT(int) _PyUnicode_IsWhitespace( 830 register const Py_UNICODE ch /* Unicode character */ 831 ); 832 833extern DL_IMPORT(int) _PyUnicode_IsLinebreak( 834 register const Py_UNICODE ch /* Unicode character */ 835 ); 836 837extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase( 838 register const Py_UNICODE ch /* Unicode character */ 839 ); 840 841extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase( 842 register const Py_UNICODE ch /* Unicode character */ 843 ); 844 845extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase( 846 register const Py_UNICODE ch /* Unicode character */ 847 ); 848 849extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit( 850 register const Py_UNICODE ch /* Unicode character */ 851 ); 852 853extern DL_IMPORT(int) _PyUnicode_ToDigit( 854 register const Py_UNICODE ch /* Unicode character */ 855 ); 856 857extern DL_IMPORT(double) _PyUnicode_ToNumeric( 858 register const Py_UNICODE ch /* Unicode character */ 859 ); 860 861extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit( 862 register const Py_UNICODE ch /* Unicode character */ 863 ); 864 865extern DL_IMPORT(int) _PyUnicode_IsDigit( 866 register const Py_UNICODE ch /* Unicode character */ 867 ); 868 869extern DL_IMPORT(int) _PyUnicode_IsNumeric( 870 register const Py_UNICODE ch /* Unicode character */ 871 ); 872 873extern DL_IMPORT(int) _PyUnicode_IsAlpha( 874 register const Py_UNICODE ch /* Unicode character */ 875 ); 876 877#ifdef __cplusplus 878} 879#endif 880#endif /* !Py_UNICODEOBJECT_H */ 881