unicodeobject.h revision b3648576cd76232e618ecc227541c7b722355f6e
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal. (See 11http://www.egenix.com/files/python/unicode-proposal.txt). 12 13Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58#include <ctype.h> 59 60/* === Internal API ======================================================= */ 61 62/* --- Internal Unicode Format -------------------------------------------- */ 63 64/* Python 3.x requires unicode */ 65#define Py_USING_UNICODE 66 67#ifndef SIZEOF_WCHAR_T 68#error Must define SIZEOF_WCHAR_T 69#endif 70 71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Py_UNICODE was the native Unicode storage format (code unit) used by 87 Python and represents a single Unicode element in the Unicode type. 88 With PEP 393, Py_UNICODE is deprecated and replaced with a 89 typedef to wchar_t. */ 90 91#ifndef Py_LIMITED_API 92#define PY_UNICODE_TYPE wchar_t 93typedef wchar_t Py_UNICODE; 94#endif 95 96/* If the compiler provides a wchar_t type we try to support it 97 through the interface functions PyUnicode_FromWideChar(), 98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ 99 100#ifdef HAVE_USABLE_WCHAR_T 101# ifndef HAVE_WCHAR_H 102# define HAVE_WCHAR_H 103# endif 104#endif 105 106#ifdef HAVE_WCHAR_H 107/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 108# ifdef _HAVE_BSDI 109# include <time.h> 110# endif 111# include <wchar.h> 112#endif 113 114/* Py_UCS4 and Py_UCS2 are typedefs for the respective 115 unicode representations. */ 116typedef uint32_t Py_UCS4; 117typedef uint16_t Py_UCS2; 118typedef uint8_t Py_UCS1; 119 120/* --- Internal Unicode Operations ---------------------------------------- */ 121 122/* Since splitting on whitespace is an important use case, and 123 whitespace in most situations is solely ASCII whitespace, we 124 optimize for the common case by using a quick look-up table 125 _Py_ascii_whitespace (see below) with an inlined check. 126 127 */ 128#ifndef Py_LIMITED_API 129#define Py_UNICODE_ISSPACE(ch) \ 130 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 131 132#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 133#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 134#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 135#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 136 137#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 138#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 139#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 140 141#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 142#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 143#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 144#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 145 146#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 147#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 148#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 149 150#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 151 152#define Py_UNICODE_ISALNUM(ch) \ 153 (Py_UNICODE_ISALPHA(ch) || \ 154 Py_UNICODE_ISDECIMAL(ch) || \ 155 Py_UNICODE_ISDIGIT(ch) || \ 156 Py_UNICODE_ISNUMERIC(ch)) 157 158#define Py_UNICODE_COPY(target, source, length) \ 159 memcpy((target), (source), (length)*sizeof(Py_UNICODE)) 160 161#define Py_UNICODE_FILL(target, value, length) \ 162 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 163 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 164 } while (0) 165 166/* macros to work with surrogates */ 167#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) 168#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF) 169#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF) 170/* Join two surrogate characters and return a single Py_UCS4 value. */ 171#define Py_UNICODE_JOIN_SURROGATES(high, low) \ 172 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 173 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 174/* high surrogate = top 10 bits added to D800 */ 175#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10)) 176/* low surrogate = bottom 10 bits added to DC00 */ 177#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF)) 178 179/* Check if substring matches at given offset. The offset must be 180 valid, and the substring must not be empty. */ 181 182#define Py_UNICODE_MATCH(string, offset, substring) \ 183 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ 184 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ 185 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) 186 187#endif /* Py_LIMITED_API */ 188 189#ifdef __cplusplus 190extern "C" { 191#endif 192 193/* --- Unicode Type ------------------------------------------------------- */ 194 195#ifndef Py_LIMITED_API 196 197/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject 198 structure. state.ascii and state.compact are set, and the data 199 immediately follow the structure. utf8_length and wstr_length can be found 200 in the length field; the utf8 pointer is equal to the data pointer. */ 201typedef struct { 202 /* There are 4 forms of Unicode strings: 203 204 - compact ascii: 205 206 * structure = PyASCIIObject 207 * test: PyUnicode_IS_COMPACT_ASCII(op) 208 * kind = PyUnicode_1BYTE_KIND 209 * compact = 1 210 * ascii = 1 211 * ready = 1 212 * (length is the length of the utf8 and wstr strings) 213 * (data starts just after the structure) 214 * (since ASCII is decoded from UTF-8, the utf8 string are the data) 215 216 - compact: 217 218 * structure = PyCompactUnicodeObject 219 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op) 220 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 221 PyUnicode_4BYTE_KIND 222 * compact = 1 223 * ready = 1 224 * ascii = 0 225 * utf8 is not shared with data 226 * utf8_length = 0 if utf8 is NULL 227 * wstr is shared with data and wstr_length=length 228 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 229 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4 230 * wstr_length = 0 if wstr is NULL 231 * (data starts just after the structure) 232 233 - legacy string, not ready: 234 235 * structure = PyUnicodeObject 236 * test: kind == PyUnicode_WCHAR_KIND 237 * length = 0 (use wstr_length) 238 * hash = -1 239 * kind = PyUnicode_WCHAR_KIND 240 * compact = 0 241 * ascii = 0 242 * ready = 0 243 * interned = SSTATE_NOT_INTERNED 244 * wstr is not NULL 245 * data.any is NULL 246 * utf8 is NULL 247 * utf8_length = 0 248 249 - legacy string, ready: 250 251 * structure = PyUnicodeObject structure 252 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND 253 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 254 PyUnicode_4BYTE_KIND 255 * compact = 0 256 * ready = 1 257 * data.any is not NULL 258 * utf8 is shared and utf8_length = length with data.any if ascii = 1 259 * utf8_length = 0 if utf8 is NULL 260 * wstr is shared with data.any and wstr_length = length 261 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 262 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 263 * wstr_length = 0 if wstr is NULL 264 265 Compact strings use only one memory block (structure + characters), 266 whereas legacy strings use one block for the structure and one block 267 for characters. 268 269 Legacy strings are created by PyUnicode_FromUnicode() and 270 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready 271 when PyUnicode_READY() is called. 272 273 See also _PyUnicode_CheckConsistency(). 274 */ 275 PyObject_HEAD 276 Py_ssize_t length; /* Number of code points in the string */ 277 Py_hash_t hash; /* Hash value; -1 if not set */ 278 struct { 279 /* 280 SSTATE_NOT_INTERNED (0) 281 SSTATE_INTERNED_MORTAL (1) 282 SSTATE_INTERNED_IMMORTAL (2) 283 284 If interned != SSTATE_NOT_INTERNED, the two references from the 285 dictionary to this object are *not* counted in ob_refcnt. 286 */ 287 unsigned int interned:2; 288 /* Character size: 289 290 - PyUnicode_WCHAR_KIND (0): 291 292 * character type = wchar_t (16 or 32 bits, depending on the 293 platform) 294 295 - PyUnicode_1BYTE_KIND (1): 296 297 * character type = Py_UCS1 (8 bits, unsigned) 298 * all characters are in the range U+0000-U+00FF (latin1) 299 * if ascii is set, all characters are in the range U+0000-U+007F 300 (ASCII), otherwise at least one character is in the range 301 U+0080-U+00FF 302 303 - PyUnicode_2BYTE_KIND (2): 304 305 * character type = Py_UCS2 (16 bits, unsigned) 306 * all characters are in the range U+0000-U+FFFF (BMP) 307 * at least one character is in the range U+0100-U+FFFF 308 309 - PyUnicode_4BYTE_KIND (4): 310 311 * character type = Py_UCS4 (32 bits, unsigned) 312 * all characters are in the range U+0000-U+10FFFF 313 * at least one character is in the range U+10000-U+10FFFF 314 */ 315 unsigned int kind:3; 316 /* Compact is with respect to the allocation scheme. Compact unicode 317 objects only require one memory block while non-compact objects use 318 one block for the PyUnicodeObject struct and another for its data 319 buffer. */ 320 unsigned int compact:1; 321 /* The string only contains characters in the range U+0000-U+007F (ASCII) 322 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is 323 set, use the PyASCIIObject structure. */ 324 unsigned int ascii:1; 325 /* The ready flag indicates whether the object layout is initialized 326 completely. This means that this is either a compact object, or 327 the data pointer is filled out. The bit is redundant, and helps 328 to minimize the test in PyUnicode_IS_READY(). */ 329 unsigned int ready:1; 330 /* Padding to ensure that PyUnicode_DATA() is always aligned to 331 4 bytes (see issue #19537 on m68k). */ 332 unsigned int :24; 333 } state; 334 wchar_t *wstr; /* wchar_t representation (null-terminated) */ 335} PyASCIIObject; 336 337/* Non-ASCII strings allocated through PyUnicode_New use the 338 PyCompactUnicodeObject structure. state.compact is set, and the data 339 immediately follow the structure. */ 340typedef struct { 341 PyASCIIObject _base; 342 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the 343 * terminating \0. */ 344 char *utf8; /* UTF-8 representation (null-terminated) */ 345 Py_ssize_t wstr_length; /* Number of code points in wstr, possible 346 * surrogates count as two code points. */ 347} PyCompactUnicodeObject; 348 349/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the 350 PyUnicodeObject structure. The actual string data is initially in the wstr 351 block, and copied into the data block using _PyUnicode_Ready. */ 352typedef struct { 353 PyCompactUnicodeObject _base; 354 union { 355 void *any; 356 Py_UCS1 *latin1; 357 Py_UCS2 *ucs2; 358 Py_UCS4 *ucs4; 359 } data; /* Canonical, smallest-form Unicode buffer */ 360} PyUnicodeObject; 361#endif 362 363PyAPI_DATA(PyTypeObject) PyUnicode_Type; 364PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 365 366#define PyUnicode_Check(op) \ 367 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 368#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 369 370/* Fast access macros */ 371#ifndef Py_LIMITED_API 372 373#define PyUnicode_WSTR_LENGTH(op) \ 374 (PyUnicode_IS_COMPACT_ASCII(op) ? \ 375 ((PyASCIIObject*)op)->length : \ 376 ((PyCompactUnicodeObject*)op)->wstr_length) 377 378/* Returns the deprecated Py_UNICODE representation's size in code units 379 (this includes surrogate pairs as 2 units). 380 If the Py_UNICODE representation is not available, it will be computed 381 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ 382 383#define PyUnicode_GET_SIZE(op) \ 384 (assert(PyUnicode_Check(op)), \ 385 (((PyASCIIObject *)(op))->wstr) ? \ 386 PyUnicode_WSTR_LENGTH(op) : \ 387 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \ 388 assert(((PyASCIIObject *)(op))->wstr), \ 389 PyUnicode_WSTR_LENGTH(op))) 390 391#define PyUnicode_GET_DATA_SIZE(op) \ 392 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) 393 394/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE 395 representation on demand. Using this macro is very inefficient now, 396 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or 397 use PyUnicode_WRITE() and PyUnicode_READ(). */ 398 399#define PyUnicode_AS_UNICODE(op) \ 400 (assert(PyUnicode_Check(op)), \ 401 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \ 402 PyUnicode_AsUnicode((PyObject *)(op))) 403 404#define PyUnicode_AS_DATA(op) \ 405 ((const char *)(PyUnicode_AS_UNICODE(op))) 406 407 408/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ 409 410/* Values for PyASCIIObject.state: */ 411 412/* Interning state. */ 413#define SSTATE_NOT_INTERNED 0 414#define SSTATE_INTERNED_MORTAL 1 415#define SSTATE_INTERNED_IMMORTAL 2 416 417/* Return true if the string contains only ASCII characters, or 0 if not. The 418 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be 419 ready. */ 420#define PyUnicode_IS_ASCII(op) \ 421 (assert(PyUnicode_Check(op)), \ 422 assert(PyUnicode_IS_READY(op)), \ 423 ((PyASCIIObject*)op)->state.ascii) 424 425/* Return true if the string is compact or 0 if not. 426 No type checks or Ready calls are performed. */ 427#define PyUnicode_IS_COMPACT(op) \ 428 (((PyASCIIObject*)(op))->state.compact) 429 430/* Return true if the string is a compact ASCII string (use PyASCIIObject 431 structure), or 0 if not. No type checks or Ready calls are performed. */ 432#define PyUnicode_IS_COMPACT_ASCII(op) \ 433 (((PyASCIIObject*)op)->state.ascii && PyUnicode_IS_COMPACT(op)) 434 435enum PyUnicode_Kind { 436/* String contains only wstr byte characters. This is only possible 437 when the string was created with a legacy API and _PyUnicode_Ready() 438 has not been called yet. */ 439 PyUnicode_WCHAR_KIND = 0, 440/* Return values of the PyUnicode_KIND() macro: */ 441 PyUnicode_1BYTE_KIND = 1, 442 PyUnicode_2BYTE_KIND = 2, 443 PyUnicode_4BYTE_KIND = 4 444}; 445 446/* Return pointers to the canonical representation cast to unsigned char, 447 Py_UCS2, or Py_UCS4 for direct character access. 448 No checks are performed, use PyUnicode_KIND() before to ensure 449 these will work correctly. */ 450 451#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op)) 452#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) 453#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) 454 455/* Return one of the PyUnicode_*_KIND values defined above. */ 456#define PyUnicode_KIND(op) \ 457 (assert(PyUnicode_Check(op)), \ 458 assert(PyUnicode_IS_READY(op)), \ 459 ((PyASCIIObject *)(op))->state.kind) 460 461/* Return a void pointer to the raw unicode buffer. */ 462#define _PyUnicode_COMPACT_DATA(op) \ 463 (PyUnicode_IS_ASCII(op) ? \ 464 ((void*)((PyASCIIObject*)(op) + 1)) : \ 465 ((void*)((PyCompactUnicodeObject*)(op) + 1))) 466 467#define _PyUnicode_NONCOMPACT_DATA(op) \ 468 (assert(((PyUnicodeObject*)(op))->data.any), \ 469 ((((PyUnicodeObject *)(op))->data.any))) 470 471#define PyUnicode_DATA(op) \ 472 (assert(PyUnicode_Check(op)), \ 473 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \ 474 _PyUnicode_NONCOMPACT_DATA(op)) 475 476/* In the access macros below, "kind" may be evaluated more than once. 477 All other macro parameters are evaluated exactly once, so it is safe 478 to put side effects into them (such as increasing the index). */ 479 480/* Write into the canonical representation, this macro does not do any sanity 481 checks and is intended for usage in loops. The caller should cache the 482 kind and data pointers obtained from other macro calls. 483 index is the index in the string (starts at 0) and value is the new 484 code point value which should be written to that location. */ 485#define PyUnicode_WRITE(kind, data, index, value) \ 486 do { \ 487 switch ((kind)) { \ 488 case PyUnicode_1BYTE_KIND: { \ 489 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \ 490 break; \ 491 } \ 492 case PyUnicode_2BYTE_KIND: { \ 493 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \ 494 break; \ 495 } \ 496 default: { \ 497 assert((kind) == PyUnicode_4BYTE_KIND); \ 498 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \ 499 } \ 500 } \ 501 } while (0) 502 503/* Read a code point from the string's canonical representation. No checks 504 or ready calls are performed. */ 505#define PyUnicode_READ(kind, data, index) \ 506 ((Py_UCS4) \ 507 ((kind) == PyUnicode_1BYTE_KIND ? \ 508 ((const Py_UCS1 *)(data))[(index)] : \ 509 ((kind) == PyUnicode_2BYTE_KIND ? \ 510 ((const Py_UCS2 *)(data))[(index)] : \ 511 ((const Py_UCS4 *)(data))[(index)] \ 512 ) \ 513 )) 514 515/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it 516 calls PyUnicode_KIND() and might call it twice. For single reads, use 517 PyUnicode_READ_CHAR, for multiple consecutive reads callers should 518 cache kind and use PyUnicode_READ instead. */ 519#define PyUnicode_READ_CHAR(unicode, index) \ 520 (assert(PyUnicode_Check(unicode)), \ 521 assert(PyUnicode_IS_READY(unicode)), \ 522 (Py_UCS4) \ 523 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \ 524 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \ 525 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \ 526 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \ 527 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \ 528 ) \ 529 )) 530 531/* Returns the length of the unicode string. The caller has to make sure that 532 the string has it's canonical representation set before calling 533 this macro. Call PyUnicode_(FAST_)Ready to ensure that. */ 534#define PyUnicode_GET_LENGTH(op) \ 535 (assert(PyUnicode_Check(op)), \ 536 assert(PyUnicode_IS_READY(op)), \ 537 ((PyASCIIObject *)(op))->length) 538 539 540/* Fast check to determine whether an object is ready. Equivalent to 541 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */ 542 543#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) 544 545/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best 546 case. If the canonical representation is not yet set, it will still call 547 _PyUnicode_Ready(). 548 Returns 0 on success and -1 on errors. */ 549#define PyUnicode_READY(op) \ 550 (assert(PyUnicode_Check(op)), \ 551 (PyUnicode_IS_READY(op) ? \ 552 0 : _PyUnicode_Ready((PyObject *)(op)))) 553 554/* Return a maximum character value which is suitable for creating another 555 string based on op. This is always an approximation but more efficient 556 than iterating over the string. */ 557#define PyUnicode_MAX_CHAR_VALUE(op) \ 558 (assert(PyUnicode_IS_READY(op)), \ 559 (PyUnicode_IS_ASCII(op) ? \ 560 (0x7f) : \ 561 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ 562 (0xffU) : \ 563 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \ 564 (0xffffU) : \ 565 (0x10ffffU))))) 566 567#endif 568 569/* --- Constants ---------------------------------------------------------- */ 570 571/* This Unicode character will be used as replacement character during 572 decoding if the errors argument is set to "replace". Note: the 573 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 574 Unicode 3.0. */ 575 576#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 577 578/* === Public API ========================================================= */ 579 580/* --- Plain Py_UNICODE --------------------------------------------------- */ 581 582/* With PEP 393, this is the recommended way to allocate a new unicode object. 583 This function will allocate the object and its buffer in a single memory 584 block. Objects created using this function are not resizable. */ 585#ifndef Py_LIMITED_API 586PyAPI_FUNC(PyObject*) PyUnicode_New( 587 Py_ssize_t size, /* Number of code points in the new string */ 588 Py_UCS4 maxchar /* maximum code point value in the string */ 589 ); 590#endif 591 592/* Initializes the canonical string representation from the deprecated 593 wstr/Py_UNICODE representation. This function is used to convert Unicode 594 objects which were created using the old API to the new flexible format 595 introduced with PEP 393. 596 597 Don't call this function directly, use the public PyUnicode_READY() macro 598 instead. */ 599#ifndef Py_LIMITED_API 600PyAPI_FUNC(int) _PyUnicode_Ready( 601 PyObject *unicode /* Unicode object */ 602 ); 603#endif 604 605/* Get a copy of a Unicode string. */ 606#ifndef Py_LIMITED_API 607PyAPI_FUNC(PyObject*) _PyUnicode_Copy( 608 PyObject *unicode 609 ); 610#endif 611 612/* Copy character from one unicode object into another, this function performs 613 character conversion when necessary and falls back to memcpy() if possible. 614 615 Fail if to is too small (smaller than *how_many* or smaller than 616 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > 617 kind(to), or if *to* has more than 1 reference. 618 619 Return the number of written character, or return -1 and raise an exception 620 on error. 621 622 Pseudo-code: 623 624 how_many = min(how_many, len(from) - from_start) 625 to[to_start:to_start+how_many] = from[from_start:from_start+how_many] 626 return how_many 627 628 Note: The function doesn't write a terminating null character. 629 */ 630#ifndef Py_LIMITED_API 631PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( 632 PyObject *to, 633 Py_ssize_t to_start, 634 PyObject *from, 635 Py_ssize_t from_start, 636 Py_ssize_t how_many 637 ); 638 639/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so 640 may crash if parameters are invalid (e.g. if the output string 641 is too short). */ 642PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters( 643 PyObject *to, 644 Py_ssize_t to_start, 645 PyObject *from, 646 Py_ssize_t from_start, 647 Py_ssize_t how_many 648 ); 649#endif 650 651#ifndef Py_LIMITED_API 652/* Fill a string with a character: write fill_char into 653 unicode[start:start+length]. 654 655 Fail if fill_char is bigger than the string maximum character, or if the 656 string has more than 1 reference. 657 658 Return the number of written character, or return -1 and raise an exception 659 on error. */ 660PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill( 661 PyObject *unicode, 662 Py_ssize_t start, 663 Py_ssize_t length, 664 Py_UCS4 fill_char 665 ); 666 667/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash 668 if parameters are invalid (e.g. if length is longer than the string). */ 669PyAPI_FUNC(void) _PyUnicode_FastFill( 670 PyObject *unicode, 671 Py_ssize_t start, 672 Py_ssize_t length, 673 Py_UCS4 fill_char 674 ); 675#endif 676 677/* Create a Unicode Object from the Py_UNICODE buffer u of the given 678 size. 679 680 u may be NULL which causes the contents to be undefined. It is the 681 user's responsibility to fill in the needed data afterwards. Note 682 that modifying the Unicode object contents after construction is 683 only allowed if u was set to NULL. 684 685 The buffer is copied into the new object. */ 686 687#ifndef Py_LIMITED_API 688PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 689 const Py_UNICODE *u, /* Unicode buffer */ 690 Py_ssize_t size /* size of buffer */ 691 ); 692#endif 693 694/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 695PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 696 const char *u, /* UTF-8 encoded string */ 697 Py_ssize_t size /* size of buffer */ 698 ); 699 700/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 701 UTF-8 encoded bytes. The size is determined with strlen(). */ 702PyAPI_FUNC(PyObject*) PyUnicode_FromString( 703 const char *u /* UTF-8 encoded string */ 704 ); 705 706#ifndef Py_LIMITED_API 707/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. 708 Scan the string to find the maximum character. */ 709PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( 710 int kind, 711 const void *buffer, 712 Py_ssize_t size); 713 714/* Create a new string from a buffer of ASCII characters. 715 WARNING: Don't check if the string contains any non-ASCII character. */ 716PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII( 717 const char *buffer, 718 Py_ssize_t size); 719#endif 720 721PyAPI_FUNC(PyObject*) PyUnicode_Substring( 722 PyObject *str, 723 Py_ssize_t start, 724 Py_ssize_t end); 725 726#ifndef Py_LIMITED_API 727/* Compute the maximum character of the substring unicode[start:end]. 728 Return 127 for an empty string. */ 729PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( 730 PyObject *unicode, 731 Py_ssize_t start, 732 Py_ssize_t end); 733#endif 734 735/* Copy the string into a UCS4 buffer including the null character if copy_null 736 is set. Return NULL and raise an exception on error. Raise a SystemError if 737 the buffer is smaller than the string. Return buffer on success. 738 739 buflen is the length of the buffer in (Py_UCS4) characters. */ 740PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 741 PyObject *unicode, 742 Py_UCS4* buffer, 743 Py_ssize_t buflen, 744 int copy_null); 745 746/* Copy the string into a UCS4 buffer. A new buffer is allocated using 747 * PyMem_Malloc; if this fails, NULL is returned with a memory error 748 exception set. */ 749PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 750 751/* Return a read-only pointer to the Unicode object's internal 752 Py_UNICODE buffer. 753 If the wchar_t/Py_UNICODE representation is not yet available, this 754 function will calculate it. */ 755 756#ifndef Py_LIMITED_API 757PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 758 PyObject *unicode /* Unicode object */ 759 ); 760#endif 761 762/* Return a read-only pointer to the Unicode object's internal 763 Py_UNICODE buffer and save the length at size. 764 If the wchar_t/Py_UNICODE representation is not yet available, this 765 function will calculate it. */ 766 767#ifndef Py_LIMITED_API 768PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( 769 PyObject *unicode, /* Unicode object */ 770 Py_ssize_t *size /* location where to save the length */ 771 ); 772#endif 773 774/* Get the length of the Unicode object. */ 775 776PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 777 PyObject *unicode 778); 779 780/* Get the number of Py_UNICODE units in the 781 string representation. */ 782 783PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 784 PyObject *unicode /* Unicode object */ 785 ); 786 787/* Read a character from the string. */ 788 789PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 790 PyObject *unicode, 791 Py_ssize_t index 792 ); 793 794/* Write a character to the string. The string must have been created through 795 PyUnicode_New, must not be shared, and must not have been hashed yet. 796 797 Return 0 on success, -1 on error. */ 798 799PyAPI_FUNC(int) PyUnicode_WriteChar( 800 PyObject *unicode, 801 Py_ssize_t index, 802 Py_UCS4 character 803 ); 804 805#ifndef Py_LIMITED_API 806/* Get the maximum ordinal for a Unicode character. */ 807PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 808#endif 809 810/* Resize a Unicode object. The length is the number of characters, except 811 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length 812 is the number of Py_UNICODE characters. 813 814 *unicode is modified to point to the new (resized) object and 0 815 returned on success. 816 817 Try to resize the string in place (which is usually faster than allocating 818 a new string and copy characters), or create a new string. 819 820 Error handling is implemented as follows: an exception is set, -1 821 is returned and *unicode left untouched. 822 823 WARNING: The function doesn't check string content, the result may not be a 824 string in canonical representation. */ 825 826PyAPI_FUNC(int) PyUnicode_Resize( 827 PyObject **unicode, /* Pointer to the Unicode object */ 828 Py_ssize_t length /* New length */ 829 ); 830 831/* Decode obj to a Unicode object. 832 833 bytes, bytearray and other bytes-like objects are decoded according to the 834 given encoding and error handler. The encoding and error handler can be 835 NULL to have the interface use UTF-8 and "strict". 836 837 All other objects (including Unicode objects) raise an exception. 838 839 The API returns NULL in case of an error. The caller is responsible 840 for decref'ing the returned objects. 841 842*/ 843 844PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 845 PyObject *obj, /* Object */ 846 const char *encoding, /* encoding */ 847 const char *errors /* error handling */ 848 ); 849 850/* Copy an instance of a Unicode subtype to a new true Unicode object if 851 necessary. If obj is already a true Unicode object (not a subtype), return 852 the reference with *incremented* refcount. 853 854 The API returns NULL in case of an error. The caller is responsible 855 for decref'ing the returned objects. 856 857*/ 858 859PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 860 PyObject *obj /* Object */ 861 ); 862 863PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 864 const char *format, /* ASCII-encoded string */ 865 va_list vargs 866 ); 867PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 868 const char *format, /* ASCII-encoded string */ 869 ... 870 ); 871 872#ifndef Py_LIMITED_API 873typedef struct { 874 PyObject *buffer; 875 void *data; 876 enum PyUnicode_Kind kind; 877 Py_UCS4 maxchar; 878 Py_ssize_t size; 879 Py_ssize_t pos; 880 881 /* minimum number of allocated characters (default: 0) */ 882 Py_ssize_t min_length; 883 884 /* minimum character (default: 127, ASCII) */ 885 Py_UCS4 min_char; 886 887 /* If non-zero, overallocate the buffer (default: 0). */ 888 unsigned char overallocate; 889 890 /* If readonly is 1, buffer is a shared string (cannot be modified) 891 and size is set to 0. */ 892 unsigned char readonly; 893} _PyUnicodeWriter ; 894 895/* Initialize a Unicode writer. 896 * 897 * By default, the minimum buffer size is 0 character and overallocation is 898 * disabled. Set min_length, min_char and overallocate attributes to control 899 * the allocation of the buffer. */ 900PyAPI_FUNC(void) 901_PyUnicodeWriter_Init(_PyUnicodeWriter *writer); 902 903/* Prepare the buffer to write 'length' characters 904 with the specified maximum character. 905 906 Return 0 on success, raise an exception and return -1 on error. */ 907#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \ 908 (((MAXCHAR) <= (WRITER)->maxchar \ 909 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \ 910 ? 0 \ 911 : (((LENGTH) == 0) \ 912 ? 0 \ 913 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))) 914 915/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro 916 instead. */ 917PyAPI_FUNC(int) 918_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 919 Py_ssize_t length, Py_UCS4 maxchar); 920 921/* Prepare the buffer to have at least the kind KIND. 922 For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will 923 support characters in range U+000-U+FFFF. 924 925 Return 0 on success, raise an exception and return -1 on error. */ 926#define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \ 927 (assert((KIND) != PyUnicode_WCHAR_KIND), \ 928 (KIND) <= (WRITER)->kind \ 929 ? 0 \ 930 : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))) 931 932/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind() 933 macro instead. */ 934PyAPI_FUNC(int) 935_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, 936 enum PyUnicode_Kind kind); 937 938/* Append a Unicode character. 939 Return 0 on success, raise an exception and return -1 on error. */ 940PyAPI_FUNC(int) 941_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, 942 Py_UCS4 ch 943 ); 944 945/* Append a Unicode string. 946 Return 0 on success, raise an exception and return -1 on error. */ 947PyAPI_FUNC(int) 948_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, 949 PyObject *str /* Unicode string */ 950 ); 951 952/* Append a substring of a Unicode string. 953 Return 0 on success, raise an exception and return -1 on error. */ 954PyAPI_FUNC(int) 955_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, 956 PyObject *str, /* Unicode string */ 957 Py_ssize_t start, 958 Py_ssize_t end 959 ); 960 961/* Append an ASCII-encoded byte string. 962 Return 0 on success, raise an exception and return -1 on error. */ 963PyAPI_FUNC(int) 964_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 965 const char *str, /* ASCII-encoded byte string */ 966 Py_ssize_t len /* number of bytes, or -1 if unknown */ 967 ); 968 969/* Append a latin1-encoded byte string. 970 Return 0 on success, raise an exception and return -1 on error. */ 971PyAPI_FUNC(int) 972_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 973 const char *str, /* latin1-encoded byte string */ 974 Py_ssize_t len /* length in bytes */ 975 ); 976 977/* Get the value of the writer as a Unicode string. Clear the 978 buffer of the writer. Raise an exception and return NULL 979 on error. */ 980PyAPI_FUNC(PyObject *) 981_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer); 982 983/* Deallocate memory of a writer (clear its internal buffer). */ 984PyAPI_FUNC(void) 985_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer); 986#endif 987 988#ifndef Py_LIMITED_API 989/* Format the object based on the format_spec, as defined in PEP 3101 990 (Advanced String Formatting). */ 991PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter( 992 _PyUnicodeWriter *writer, 993 PyObject *obj, 994 PyObject *format_spec, 995 Py_ssize_t start, 996 Py_ssize_t end); 997#endif 998 999PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 1000PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 1001PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 1002 const char *u /* UTF-8 encoded string */ 1003 ); 1004#ifndef Py_LIMITED_API 1005PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 1006#endif 1007 1008/* Use only if you know it's a string */ 1009#define PyUnicode_CHECK_INTERNED(op) \ 1010 (((PyASCIIObject *)(op))->state.interned) 1011 1012/* --- wchar_t support for platforms which support it --------------------- */ 1013 1014#ifdef HAVE_WCHAR_H 1015 1016/* Create a Unicode Object from the wchar_t buffer w of the given 1017 size. 1018 1019 The buffer is copied into the new object. */ 1020 1021PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 1022 const wchar_t *w, /* wchar_t buffer */ 1023 Py_ssize_t size /* size of buffer */ 1024 ); 1025 1026/* Copies the Unicode Object contents into the wchar_t buffer w. At 1027 most size wchar_t characters are copied. 1028 1029 Note that the resulting wchar_t string may or may not be 1030 0-terminated. It is the responsibility of the caller to make sure 1031 that the wchar_t string is 0-terminated in case this is required by 1032 the application. 1033 1034 Returns the number of wchar_t characters copied (excluding a 1035 possibly trailing 0-termination character) or -1 in case of an 1036 error. */ 1037 1038PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 1039 PyObject *unicode, /* Unicode object */ 1040 wchar_t *w, /* wchar_t buffer */ 1041 Py_ssize_t size /* size of buffer */ 1042 ); 1043 1044/* Convert the Unicode object to a wide character string. The output string 1045 always ends with a nul character. If size is not NULL, write the number of 1046 wide characters (excluding the null character) into *size. 1047 1048 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) 1049 on success. On error, returns NULL, *size is undefined and raises a 1050 MemoryError. */ 1051 1052PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 1053 PyObject *unicode, /* Unicode object */ 1054 Py_ssize_t *size /* number of characters of the result */ 1055 ); 1056 1057#ifndef Py_LIMITED_API 1058PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); 1059#endif 1060 1061#endif 1062 1063/* --- Unicode ordinals --------------------------------------------------- */ 1064 1065/* Create a Unicode Object from the given Unicode code point ordinal. 1066 1067 The ordinal must be in range(0x110000). A ValueError is 1068 raised in case it is not. 1069 1070*/ 1071 1072PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 1073 1074/* --- Free-list management ----------------------------------------------- */ 1075 1076/* Clear the free list used by the Unicode implementation. 1077 1078 This can be used to release memory used for objects on the free 1079 list back to the Python memory allocator. 1080 1081*/ 1082 1083PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 1084 1085/* === Builtin Codecs ===================================================== 1086 1087 Many of these APIs take two arguments encoding and errors. These 1088 parameters encoding and errors have the same semantics as the ones 1089 of the builtin str() API. 1090 1091 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 1092 1093 Error handling is set by errors which may also be set to NULL 1094 meaning to use the default handling defined for the codec. Default 1095 error handling for all builtin codecs is "strict" (ValueErrors are 1096 raised). 1097 1098 The codecs all use a similar interface. Only deviation from the 1099 generic ones are documented. 1100 1101*/ 1102 1103/* --- Manage the default encoding ---------------------------------------- */ 1104 1105/* Returns a pointer to the default encoding (UTF-8) of the 1106 Unicode object unicode and the size of the encoded representation 1107 in bytes stored in *size. 1108 1109 In case of an error, no *size is set. 1110 1111 This function caches the UTF-8 encoded string in the unicodeobject 1112 and subsequent calls will return the same string. The memory is released 1113 when the unicodeobject is deallocated. 1114 1115 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to 1116 support the previous internal function with the same behaviour. 1117 1118 *** This API is for interpreter INTERNAL USE ONLY and will likely 1119 *** be removed or changed in the future. 1120 1121 *** If you need to access the Unicode object as UTF-8 bytes string, 1122 *** please use PyUnicode_AsUTF8String() instead. 1123*/ 1124 1125#ifndef Py_LIMITED_API 1126PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize( 1127 PyObject *unicode, 1128 Py_ssize_t *size); 1129#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize 1130#endif 1131 1132/* Returns a pointer to the default encoding (UTF-8) of the 1133 Unicode object unicode. 1134 1135 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation 1136 in the unicodeobject. 1137 1138 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to 1139 support the previous internal function with the same behaviour. 1140 1141 Use of this API is DEPRECATED since no size information can be 1142 extracted from the returned data. 1143 1144 *** This API is for interpreter INTERNAL USE ONLY and will likely 1145 *** be removed or changed for Python 3.1. 1146 1147 *** If you need to access the Unicode object as UTF-8 bytes string, 1148 *** please use PyUnicode_AsUTF8String() instead. 1149 1150*/ 1151 1152#ifndef Py_LIMITED_API 1153PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode); 1154#define _PyUnicode_AsString PyUnicode_AsUTF8 1155#endif 1156 1157/* Returns "utf-8". */ 1158 1159PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 1160 1161/* --- Generic Codecs ----------------------------------------------------- */ 1162 1163/* Create a Unicode object by decoding the encoded string s of the 1164 given size. */ 1165 1166PyAPI_FUNC(PyObject*) PyUnicode_Decode( 1167 const char *s, /* encoded string */ 1168 Py_ssize_t size, /* size of buffer */ 1169 const char *encoding, /* encoding */ 1170 const char *errors /* error handling */ 1171 ); 1172 1173/* Decode a Unicode object unicode and return the result as Python 1174 object. */ 1175 1176PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 1177 PyObject *unicode, /* Unicode object */ 1178 const char *encoding, /* encoding */ 1179 const char *errors /* error handling */ 1180 ); 1181 1182/* Decode a Unicode object unicode and return the result as Unicode 1183 object. */ 1184 1185PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 1186 PyObject *unicode, /* Unicode object */ 1187 const char *encoding, /* encoding */ 1188 const char *errors /* error handling */ 1189 ); 1190 1191/* Encodes a Py_UNICODE buffer of the given size and returns a 1192 Python string object. */ 1193 1194#ifndef Py_LIMITED_API 1195PyAPI_FUNC(PyObject*) PyUnicode_Encode( 1196 const Py_UNICODE *s, /* Unicode char buffer */ 1197 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 1198 const char *encoding, /* encoding */ 1199 const char *errors /* error handling */ 1200 ); 1201#endif 1202 1203/* Encodes a Unicode object and returns the result as Python 1204 object. */ 1205 1206PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 1207 PyObject *unicode, /* Unicode object */ 1208 const char *encoding, /* encoding */ 1209 const char *errors /* error handling */ 1210 ); 1211 1212/* Encodes a Unicode object and returns the result as Python string 1213 object. */ 1214 1215PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 1216 PyObject *unicode, /* Unicode object */ 1217 const char *encoding, /* encoding */ 1218 const char *errors /* error handling */ 1219 ); 1220 1221/* Encodes a Unicode object and returns the result as Unicode 1222 object. */ 1223 1224PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 1225 PyObject *unicode, /* Unicode object */ 1226 const char *encoding, /* encoding */ 1227 const char *errors /* error handling */ 1228 ); 1229 1230/* Build an encoding map. */ 1231 1232PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 1233 PyObject* string /* 256 character map */ 1234 ); 1235 1236/* --- UTF-7 Codecs ------------------------------------------------------- */ 1237 1238PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 1239 const char *string, /* UTF-7 encoded string */ 1240 Py_ssize_t length, /* size of string */ 1241 const char *errors /* error handling */ 1242 ); 1243 1244PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 1245 const char *string, /* UTF-7 encoded string */ 1246 Py_ssize_t length, /* size of string */ 1247 const char *errors, /* error handling */ 1248 Py_ssize_t *consumed /* bytes consumed */ 1249 ); 1250 1251#ifndef Py_LIMITED_API 1252PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 1253 const Py_UNICODE *data, /* Unicode char buffer */ 1254 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1255 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1256 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1257 const char *errors /* error handling */ 1258 ); 1259PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7( 1260 PyObject *unicode, /* Unicode object */ 1261 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1262 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1263 const char *errors /* error handling */ 1264 ); 1265#endif 1266 1267/* --- UTF-8 Codecs ------------------------------------------------------- */ 1268 1269PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 1270 const char *string, /* UTF-8 encoded string */ 1271 Py_ssize_t length, /* size of string */ 1272 const char *errors /* error handling */ 1273 ); 1274 1275PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 1276 const char *string, /* UTF-8 encoded string */ 1277 Py_ssize_t length, /* size of string */ 1278 const char *errors, /* error handling */ 1279 Py_ssize_t *consumed /* bytes consumed */ 1280 ); 1281 1282PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 1283 PyObject *unicode /* Unicode object */ 1284 ); 1285 1286#ifndef Py_LIMITED_API 1287PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( 1288 PyObject *unicode, 1289 const char *errors); 1290 1291PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 1292 const Py_UNICODE *data, /* Unicode char buffer */ 1293 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1294 const char *errors /* error handling */ 1295 ); 1296#endif 1297 1298/* --- UTF-32 Codecs ------------------------------------------------------ */ 1299 1300/* Decodes length bytes from a UTF-32 encoded buffer string and returns 1301 the corresponding Unicode object. 1302 1303 errors (if non-NULL) defines the error handling. It defaults 1304 to "strict". 1305 1306 If byteorder is non-NULL, the decoder starts decoding using the 1307 given byte order: 1308 1309 *byteorder == -1: little endian 1310 *byteorder == 0: native order 1311 *byteorder == 1: big endian 1312 1313 In native mode, the first four bytes of the stream are checked for a 1314 BOM mark. If found, the BOM mark is analysed, the byte order 1315 adjusted and the BOM skipped. In the other modes, no BOM mark 1316 interpretation is done. After completion, *byteorder is set to the 1317 current byte order at the end of input data. 1318 1319 If byteorder is NULL, the codec starts in native order mode. 1320 1321*/ 1322 1323PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 1324 const char *string, /* UTF-32 encoded string */ 1325 Py_ssize_t length, /* size of string */ 1326 const char *errors, /* error handling */ 1327 int *byteorder /* pointer to byteorder to use 1328 0=native;-1=LE,1=BE; updated on 1329 exit */ 1330 ); 1331 1332PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 1333 const char *string, /* UTF-32 encoded string */ 1334 Py_ssize_t length, /* size of string */ 1335 const char *errors, /* error handling */ 1336 int *byteorder, /* pointer to byteorder to use 1337 0=native;-1=LE,1=BE; updated on 1338 exit */ 1339 Py_ssize_t *consumed /* bytes consumed */ 1340 ); 1341 1342/* Returns a Python string using the UTF-32 encoding in native byte 1343 order. The string always starts with a BOM mark. */ 1344 1345PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 1346 PyObject *unicode /* Unicode object */ 1347 ); 1348 1349/* Returns a Python string object holding the UTF-32 encoded value of 1350 the Unicode data. 1351 1352 If byteorder is not 0, output is written according to the following 1353 byte order: 1354 1355 byteorder == -1: little endian 1356 byteorder == 0: native byte order (writes a BOM mark) 1357 byteorder == 1: big endian 1358 1359 If byteorder is 0, the output string will always start with the 1360 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1361 prepended. 1362 1363*/ 1364 1365#ifndef Py_LIMITED_API 1366PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 1367 const Py_UNICODE *data, /* Unicode char buffer */ 1368 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1369 const char *errors, /* error handling */ 1370 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1371 ); 1372PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32( 1373 PyObject *object, /* Unicode object */ 1374 const char *errors, /* error handling */ 1375 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1376 ); 1377#endif 1378 1379/* --- UTF-16 Codecs ------------------------------------------------------ */ 1380 1381/* Decodes length bytes from a UTF-16 encoded buffer string and returns 1382 the corresponding Unicode object. 1383 1384 errors (if non-NULL) defines the error handling. It defaults 1385 to "strict". 1386 1387 If byteorder is non-NULL, the decoder starts decoding using the 1388 given byte order: 1389 1390 *byteorder == -1: little endian 1391 *byteorder == 0: native order 1392 *byteorder == 1: big endian 1393 1394 In native mode, the first two bytes of the stream are checked for a 1395 BOM mark. If found, the BOM mark is analysed, the byte order 1396 adjusted and the BOM skipped. In the other modes, no BOM mark 1397 interpretation is done. After completion, *byteorder is set to the 1398 current byte order at the end of input data. 1399 1400 If byteorder is NULL, the codec starts in native order mode. 1401 1402*/ 1403 1404PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 1405 const char *string, /* UTF-16 encoded string */ 1406 Py_ssize_t length, /* size of string */ 1407 const char *errors, /* error handling */ 1408 int *byteorder /* pointer to byteorder to use 1409 0=native;-1=LE,1=BE; updated on 1410 exit */ 1411 ); 1412 1413PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 1414 const char *string, /* UTF-16 encoded string */ 1415 Py_ssize_t length, /* size of string */ 1416 const char *errors, /* error handling */ 1417 int *byteorder, /* pointer to byteorder to use 1418 0=native;-1=LE,1=BE; updated on 1419 exit */ 1420 Py_ssize_t *consumed /* bytes consumed */ 1421 ); 1422 1423/* Returns a Python string using the UTF-16 encoding in native byte 1424 order. The string always starts with a BOM mark. */ 1425 1426PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 1427 PyObject *unicode /* Unicode object */ 1428 ); 1429 1430/* Returns a Python string object holding the UTF-16 encoded value of 1431 the Unicode data. 1432 1433 If byteorder is not 0, output is written according to the following 1434 byte order: 1435 1436 byteorder == -1: little endian 1437 byteorder == 0: native byte order (writes a BOM mark) 1438 byteorder == 1: big endian 1439 1440 If byteorder is 0, the output string will always start with the 1441 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1442 prepended. 1443 1444 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 1445 UCS-2. This trick makes it possible to add full UTF-16 capabilities 1446 at a later point without compromising the APIs. 1447 1448*/ 1449 1450#ifndef Py_LIMITED_API 1451PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 1452 const Py_UNICODE *data, /* Unicode char buffer */ 1453 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1454 const char *errors, /* error handling */ 1455 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1456 ); 1457PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16( 1458 PyObject* unicode, /* Unicode object */ 1459 const char *errors, /* error handling */ 1460 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1461 ); 1462#endif 1463 1464/* --- Unicode-Escape Codecs ---------------------------------------------- */ 1465 1466PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 1467 const char *string, /* Unicode-Escape encoded string */ 1468 Py_ssize_t length, /* size of string */ 1469 const char *errors /* error handling */ 1470 ); 1471 1472PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 1473 PyObject *unicode /* Unicode object */ 1474 ); 1475 1476#ifndef Py_LIMITED_API 1477PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 1478 const Py_UNICODE *data, /* Unicode char buffer */ 1479 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1480 ); 1481#endif 1482 1483/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 1484 1485PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 1486 const char *string, /* Raw-Unicode-Escape encoded string */ 1487 Py_ssize_t length, /* size of string */ 1488 const char *errors /* error handling */ 1489 ); 1490 1491PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 1492 PyObject *unicode /* Unicode object */ 1493 ); 1494 1495#ifndef Py_LIMITED_API 1496PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 1497 const Py_UNICODE *data, /* Unicode char buffer */ 1498 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1499 ); 1500#endif 1501 1502/* --- Unicode Internal Codec --------------------------------------------- 1503 1504 Only for internal use in _codecsmodule.c */ 1505 1506#ifndef Py_LIMITED_API 1507PyObject *_PyUnicode_DecodeUnicodeInternal( 1508 const char *string, 1509 Py_ssize_t length, 1510 const char *errors 1511 ); 1512#endif 1513 1514/* --- Latin-1 Codecs ----------------------------------------------------- 1515 1516 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1517 1518*/ 1519 1520PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1521 const char *string, /* Latin-1 encoded string */ 1522 Py_ssize_t length, /* size of string */ 1523 const char *errors /* error handling */ 1524 ); 1525 1526PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1527 PyObject *unicode /* Unicode object */ 1528 ); 1529 1530#ifndef Py_LIMITED_API 1531PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( 1532 PyObject* unicode, 1533 const char* errors); 1534 1535PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1536 const Py_UNICODE *data, /* Unicode char buffer */ 1537 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1538 const char *errors /* error handling */ 1539 ); 1540#endif 1541 1542/* --- ASCII Codecs ------------------------------------------------------- 1543 1544 Only 7-bit ASCII data is excepted. All other codes generate errors. 1545 1546*/ 1547 1548PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1549 const char *string, /* ASCII encoded string */ 1550 Py_ssize_t length, /* size of string */ 1551 const char *errors /* error handling */ 1552 ); 1553 1554PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1555 PyObject *unicode /* Unicode object */ 1556 ); 1557 1558#ifndef Py_LIMITED_API 1559PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( 1560 PyObject* unicode, 1561 const char* errors); 1562 1563PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1564 const Py_UNICODE *data, /* Unicode char buffer */ 1565 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1566 const char *errors /* error handling */ 1567 ); 1568#endif 1569 1570/* --- Character Map Codecs ----------------------------------------------- 1571 1572 This codec uses mappings to encode and decode characters. 1573 1574 Decoding mappings must map single string characters to single 1575 Unicode characters, integers (which are then interpreted as Unicode 1576 ordinals) or None (meaning "undefined mapping" and causing an 1577 error). 1578 1579 Encoding mappings must map single Unicode characters to single 1580 string characters, integers (which are then interpreted as Latin-1 1581 ordinals) or None (meaning "undefined mapping" and causing an 1582 error). 1583 1584 If a character lookup fails with a LookupError, the character is 1585 copied as-is meaning that its ordinal value will be interpreted as 1586 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1587 to contain those mappings which map characters to different code 1588 points. 1589 1590*/ 1591 1592PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1593 const char *string, /* Encoded string */ 1594 Py_ssize_t length, /* size of string */ 1595 PyObject *mapping, /* character mapping 1596 (char ordinal -> unicode ordinal) */ 1597 const char *errors /* error handling */ 1598 ); 1599 1600PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1601 PyObject *unicode, /* Unicode object */ 1602 PyObject *mapping /* character mapping 1603 (unicode ordinal -> char ordinal) */ 1604 ); 1605 1606#ifndef Py_LIMITED_API 1607PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1608 const Py_UNICODE *data, /* Unicode char buffer */ 1609 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1610 PyObject *mapping, /* character mapping 1611 (unicode ordinal -> char ordinal) */ 1612 const char *errors /* error handling */ 1613 ); 1614PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap( 1615 PyObject *unicode, /* Unicode object */ 1616 PyObject *mapping, /* character mapping 1617 (unicode ordinal -> char ordinal) */ 1618 const char *errors /* error handling */ 1619 ); 1620#endif 1621 1622/* Translate a Py_UNICODE buffer of the given length by applying a 1623 character mapping table to it and return the resulting Unicode 1624 object. 1625 1626 The mapping table must map Unicode ordinal integers to Unicode 1627 ordinal integers or None (causing deletion of the character). 1628 1629 Mapping tables may be dictionaries or sequences. Unmapped character 1630 ordinals (ones which cause a LookupError) are left untouched and 1631 are copied as-is. 1632 1633*/ 1634 1635#ifndef Py_LIMITED_API 1636PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1637 const Py_UNICODE *data, /* Unicode char buffer */ 1638 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1639 PyObject *table, /* Translate table */ 1640 const char *errors /* error handling */ 1641 ); 1642#endif 1643 1644#ifdef MS_WINDOWS 1645 1646/* --- MBCS codecs for Windows -------------------------------------------- */ 1647 1648PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1649 const char *string, /* MBCS encoded string */ 1650 Py_ssize_t length, /* size of string */ 1651 const char *errors /* error handling */ 1652 ); 1653 1654PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1655 const char *string, /* MBCS encoded string */ 1656 Py_ssize_t length, /* size of string */ 1657 const char *errors, /* error handling */ 1658 Py_ssize_t *consumed /* bytes consumed */ 1659 ); 1660 1661PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 1662 int code_page, /* code page number */ 1663 const char *string, /* encoded string */ 1664 Py_ssize_t length, /* size of string */ 1665 const char *errors, /* error handling */ 1666 Py_ssize_t *consumed /* bytes consumed */ 1667 ); 1668 1669PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1670 PyObject *unicode /* Unicode object */ 1671 ); 1672 1673#ifndef Py_LIMITED_API 1674PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1675 const Py_UNICODE *data, /* Unicode char buffer */ 1676 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1677 const char *errors /* error handling */ 1678 ); 1679#endif 1680 1681PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 1682 int code_page, /* code page number */ 1683 PyObject *unicode, /* Unicode object */ 1684 const char *errors /* error handling */ 1685 ); 1686 1687#endif /* MS_WINDOWS */ 1688 1689/* --- Decimal Encoder ---------------------------------------------------- */ 1690 1691/* Takes a Unicode string holding a decimal value and writes it into 1692 an output buffer using standard ASCII digit codes. 1693 1694 The output buffer has to provide at least length+1 bytes of storage 1695 area. The output string is 0-terminated. 1696 1697 The encoder converts whitespace to ' ', decimal characters to their 1698 corresponding ASCII digit and all other Latin-1 characters except 1699 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 1700 are treated as errors. This includes embedded NULL bytes. 1701 1702 Error handling is defined by the errors argument: 1703 1704 NULL or "strict": raise a ValueError 1705 "ignore": ignore the wrong characters (these are not copied to the 1706 output buffer) 1707 "replace": replaces illegal characters with '?' 1708 1709 Returns 0 on success, -1 on failure. 1710 1711*/ 1712 1713#ifndef Py_LIMITED_API 1714PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1715 Py_UNICODE *s, /* Unicode buffer */ 1716 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1717 char *output, /* Output buffer; must have size >= length */ 1718 const char *errors /* error handling */ 1719 ); 1720#endif 1721 1722/* Transforms code points that have decimal digit property to the 1723 corresponding ASCII digit code points. 1724 1725 Returns a new Unicode string on success, NULL on failure. 1726*/ 1727 1728#ifndef Py_LIMITED_API 1729PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( 1730 Py_UNICODE *s, /* Unicode buffer */ 1731 Py_ssize_t length /* Number of Py_UNICODE chars to transform */ 1732 ); 1733#endif 1734 1735/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject 1736 as argument instead of a raw buffer and length. This function additionally 1737 transforms spaces to ASCII because this is what the callers in longobject, 1738 floatobject, and complexobject did anyways. */ 1739 1740#ifndef Py_LIMITED_API 1741PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( 1742 PyObject *unicode /* Unicode object */ 1743 ); 1744#endif 1745 1746/* --- Locale encoding --------------------------------------------------- */ 1747 1748/* Decode a string from the current locale encoding. The decoder is strict if 1749 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 1750 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 1751 be decoded as a surrogate character and *surrogateescape* is not equal to 1752 zero, the byte sequence is escaped using the 'surrogateescape' error handler 1753 instead of being decoded. *str* must end with a null character but cannot 1754 contain embedded null characters. */ 1755 1756PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 1757 const char *str, 1758 Py_ssize_t len, 1759 const char *errors); 1760 1761/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 1762 length using strlen(). */ 1763 1764PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 1765 const char *str, 1766 const char *errors); 1767 1768/* Encode a Unicode object to the current locale encoding. The encoder is 1769 strict is *surrogateescape* is equal to zero, otherwise the 1770 "surrogateescape" error handler is used. Return a bytes object. The string 1771 cannot contain embedded null characters. */ 1772 1773PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 1774 PyObject *unicode, 1775 const char *errors 1776 ); 1777 1778/* --- File system encoding ---------------------------------------------- */ 1779 1780/* ParseTuple converter: encode str objects to bytes using 1781 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 1782 1783PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1784 1785/* ParseTuple converter: decode bytes objects to unicode using 1786 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 1787 1788PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1789 1790/* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1791 and the "surrogateescape" error handler. 1792 1793 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1794 encoding. 1795 1796 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1797*/ 1798 1799PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1800 const char *s /* encoded string */ 1801 ); 1802 1803/* Decode a string using Py_FileSystemDefaultEncoding 1804 and the "surrogateescape" error handler. 1805 1806 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1807 encoding. 1808*/ 1809 1810PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1811 const char *s, /* encoded string */ 1812 Py_ssize_t size /* size */ 1813 ); 1814 1815/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1816 "surrogateescape" error handler, and return bytes. 1817 1818 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1819 encoding. 1820*/ 1821 1822PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1823 PyObject *unicode 1824 ); 1825 1826/* --- Methods & Slots ---------------------------------------------------- 1827 1828 These are capable of handling Unicode objects and strings on input 1829 (we refer to them as strings in the descriptions) and return 1830 Unicode objects or integers as appropriate. */ 1831 1832/* Concat two strings giving a new Unicode string. */ 1833 1834PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1835 PyObject *left, /* Left string */ 1836 PyObject *right /* Right string */ 1837 ); 1838 1839/* Concat two strings and put the result in *pleft 1840 (sets *pleft to NULL on error) */ 1841 1842PyAPI_FUNC(void) PyUnicode_Append( 1843 PyObject **pleft, /* Pointer to left string */ 1844 PyObject *right /* Right string */ 1845 ); 1846 1847/* Concat two strings, put the result in *pleft and drop the right object 1848 (sets *pleft to NULL on error) */ 1849 1850PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1851 PyObject **pleft, /* Pointer to left string */ 1852 PyObject *right /* Right string */ 1853 ); 1854 1855/* Split a string giving a list of Unicode strings. 1856 1857 If sep is NULL, splitting will be done at all whitespace 1858 substrings. Otherwise, splits occur at the given separator. 1859 1860 At most maxsplit splits will be done. If negative, no limit is set. 1861 1862 Separators are not included in the resulting list. 1863 1864*/ 1865 1866PyAPI_FUNC(PyObject*) PyUnicode_Split( 1867 PyObject *s, /* String to split */ 1868 PyObject *sep, /* String separator */ 1869 Py_ssize_t maxsplit /* Maxsplit count */ 1870 ); 1871 1872/* Dito, but split at line breaks. 1873 1874 CRLF is considered to be one line break. Line breaks are not 1875 included in the resulting list. */ 1876 1877PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1878 PyObject *s, /* String to split */ 1879 int keepends /* If true, line end markers are included */ 1880 ); 1881 1882/* Partition a string using a given separator. */ 1883 1884PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1885 PyObject *s, /* String to partition */ 1886 PyObject *sep /* String separator */ 1887 ); 1888 1889/* Partition a string using a given separator, searching from the end of the 1890 string. */ 1891 1892PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1893 PyObject *s, /* String to partition */ 1894 PyObject *sep /* String separator */ 1895 ); 1896 1897/* Split a string giving a list of Unicode strings. 1898 1899 If sep is NULL, splitting will be done at all whitespace 1900 substrings. Otherwise, splits occur at the given separator. 1901 1902 At most maxsplit splits will be done. But unlike PyUnicode_Split 1903 PyUnicode_RSplit splits from the end of the string. If negative, 1904 no limit is set. 1905 1906 Separators are not included in the resulting list. 1907 1908*/ 1909 1910PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1911 PyObject *s, /* String to split */ 1912 PyObject *sep, /* String separator */ 1913 Py_ssize_t maxsplit /* Maxsplit count */ 1914 ); 1915 1916/* Translate a string by applying a character mapping table to it and 1917 return the resulting Unicode object. 1918 1919 The mapping table must map Unicode ordinal integers to Unicode 1920 ordinal integers or None (causing deletion of the character). 1921 1922 Mapping tables may be dictionaries or sequences. Unmapped character 1923 ordinals (ones which cause a LookupError) are left untouched and 1924 are copied as-is. 1925 1926*/ 1927 1928PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1929 PyObject *str, /* String */ 1930 PyObject *table, /* Translate table */ 1931 const char *errors /* error handling */ 1932 ); 1933 1934/* Join a sequence of strings using the given separator and return 1935 the resulting Unicode string. */ 1936 1937PyAPI_FUNC(PyObject*) PyUnicode_Join( 1938 PyObject *separator, /* Separator string */ 1939 PyObject *seq /* Sequence object */ 1940 ); 1941 1942#ifndef Py_LIMITED_API 1943PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray( 1944 PyObject *separator, 1945 PyObject **items, 1946 Py_ssize_t seqlen 1947 ); 1948#endif /* Py_LIMITED_API */ 1949 1950/* Return 1 if substr matches str[start:end] at the given tail end, 0 1951 otherwise. */ 1952 1953PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1954 PyObject *str, /* String */ 1955 PyObject *substr, /* Prefix or Suffix string */ 1956 Py_ssize_t start, /* Start index */ 1957 Py_ssize_t end, /* Stop index */ 1958 int direction /* Tail end: -1 prefix, +1 suffix */ 1959 ); 1960 1961/* Return the first position of substr in str[start:end] using the 1962 given search direction or -1 if not found. -2 is returned in case 1963 an error occurred and an exception is set. */ 1964 1965PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1966 PyObject *str, /* String */ 1967 PyObject *substr, /* Substring to find */ 1968 Py_ssize_t start, /* Start index */ 1969 Py_ssize_t end, /* Stop index */ 1970 int direction /* Find direction: +1 forward, -1 backward */ 1971 ); 1972 1973/* Like PyUnicode_Find, but search for single character only. */ 1974PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 1975 PyObject *str, 1976 Py_UCS4 ch, 1977 Py_ssize_t start, 1978 Py_ssize_t end, 1979 int direction 1980 ); 1981 1982/* Count the number of occurrences of substr in str[start:end]. */ 1983 1984PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1985 PyObject *str, /* String */ 1986 PyObject *substr, /* Substring to count */ 1987 Py_ssize_t start, /* Start index */ 1988 Py_ssize_t end /* Stop index */ 1989 ); 1990 1991/* Replace at most maxcount occurrences of substr in str with replstr 1992 and return the resulting Unicode object. */ 1993 1994PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1995 PyObject *str, /* String */ 1996 PyObject *substr, /* Substring to find */ 1997 PyObject *replstr, /* Substring to replace */ 1998 Py_ssize_t maxcount /* Max. number of replacements to apply; 1999 -1 = all */ 2000 ); 2001 2002/* Compare two strings and return -1, 0, 1 for less than, equal, 2003 greater than resp. 2004 Raise an exception and return -1 on error. */ 2005 2006PyAPI_FUNC(int) PyUnicode_Compare( 2007 PyObject *left, /* Left string */ 2008 PyObject *right /* Right string */ 2009 ); 2010 2011#ifndef Py_LIMITED_API 2012PyAPI_FUNC(int) _PyUnicode_CompareWithId( 2013 PyObject *left, /* Left string */ 2014 _Py_Identifier *right /* Right identifier */ 2015 ); 2016#endif 2017 2018PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 2019 PyObject *left, 2020 const char *right /* ASCII-encoded string */ 2021 ); 2022 2023/* Rich compare two strings and return one of the following: 2024 2025 - NULL in case an exception was raised 2026 - Py_True or Py_False for successful comparisons 2027 - Py_NotImplemented in case the type combination is unknown 2028 2029 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 2030 case the conversion of the arguments to Unicode fails with a 2031 UnicodeDecodeError. 2032 2033 Possible values for op: 2034 2035 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 2036 2037*/ 2038 2039PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 2040 PyObject *left, /* Left string */ 2041 PyObject *right, /* Right string */ 2042 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 2043 ); 2044 2045/* Apply an argument tuple or dictionary to a format string and return 2046 the resulting Unicode string. */ 2047 2048PyAPI_FUNC(PyObject *) PyUnicode_Format( 2049 PyObject *format, /* Format string */ 2050 PyObject *args /* Argument tuple or dictionary */ 2051 ); 2052 2053/* Checks whether element is contained in container and return 1/0 2054 accordingly. 2055 2056 element has to coerce to a one element Unicode string. -1 is 2057 returned in case of an error. */ 2058 2059PyAPI_FUNC(int) PyUnicode_Contains( 2060 PyObject *container, /* Container string */ 2061 PyObject *element /* Element string */ 2062 ); 2063 2064/* Checks whether argument is a valid identifier. */ 2065 2066PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 2067 2068#ifndef Py_LIMITED_API 2069/* Externally visible for str.strip(unicode) */ 2070PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 2071 PyObject *self, 2072 int striptype, 2073 PyObject *sepobj 2074 ); 2075#endif 2076 2077/* Using explicit passed-in values, insert the thousands grouping 2078 into the string pointed to by buffer. For the argument descriptions, 2079 see Objects/stringlib/localeutil.h */ 2080#ifndef Py_LIMITED_API 2081PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( 2082 PyObject *unicode, 2083 Py_ssize_t index, 2084 Py_ssize_t n_buffer, 2085 void *digits, 2086 Py_ssize_t n_digits, 2087 Py_ssize_t min_width, 2088 const char *grouping, 2089 PyObject *thousands_sep, 2090 Py_UCS4 *maxchar); 2091#endif 2092/* === Characters Type APIs =============================================== */ 2093 2094/* Helper array used by Py_UNICODE_ISSPACE(). */ 2095 2096#ifndef Py_LIMITED_API 2097PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 2098 2099/* These should not be used directly. Use the Py_UNICODE_IS* and 2100 Py_UNICODE_TO* macros instead. 2101 2102 These APIs are implemented in Objects/unicodectype.c. 2103 2104*/ 2105 2106PyAPI_FUNC(int) _PyUnicode_IsLowercase( 2107 Py_UCS4 ch /* Unicode character */ 2108 ); 2109 2110PyAPI_FUNC(int) _PyUnicode_IsUppercase( 2111 Py_UCS4 ch /* Unicode character */ 2112 ); 2113 2114PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 2115 Py_UCS4 ch /* Unicode character */ 2116 ); 2117 2118PyAPI_FUNC(int) _PyUnicode_IsXidStart( 2119 Py_UCS4 ch /* Unicode character */ 2120 ); 2121 2122PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 2123 Py_UCS4 ch /* Unicode character */ 2124 ); 2125 2126PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 2127 const Py_UCS4 ch /* Unicode character */ 2128 ); 2129 2130PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 2131 const Py_UCS4 ch /* Unicode character */ 2132 ); 2133 2134PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 2135 Py_UCS4 ch /* Unicode character */ 2136 ); 2137 2138PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 2139 Py_UCS4 ch /* Unicode character */ 2140 ); 2141 2142PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 2143 Py_UCS4 ch /* Unicode character */ 2144 ); 2145 2146PyAPI_FUNC(int) _PyUnicode_ToLowerFull( 2147 Py_UCS4 ch, /* Unicode character */ 2148 Py_UCS4 *res 2149 ); 2150 2151PyAPI_FUNC(int) _PyUnicode_ToTitleFull( 2152 Py_UCS4 ch, /* Unicode character */ 2153 Py_UCS4 *res 2154 ); 2155 2156PyAPI_FUNC(int) _PyUnicode_ToUpperFull( 2157 Py_UCS4 ch, /* Unicode character */ 2158 Py_UCS4 *res 2159 ); 2160 2161PyAPI_FUNC(int) _PyUnicode_ToFoldedFull( 2162 Py_UCS4 ch, /* Unicode character */ 2163 Py_UCS4 *res 2164 ); 2165 2166PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( 2167 Py_UCS4 ch /* Unicode character */ 2168 ); 2169 2170PyAPI_FUNC(int) _PyUnicode_IsCased( 2171 Py_UCS4 ch /* Unicode character */ 2172 ); 2173 2174PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 2175 Py_UCS4 ch /* Unicode character */ 2176 ); 2177 2178PyAPI_FUNC(int) _PyUnicode_ToDigit( 2179 Py_UCS4 ch /* Unicode character */ 2180 ); 2181 2182PyAPI_FUNC(double) _PyUnicode_ToNumeric( 2183 Py_UCS4 ch /* Unicode character */ 2184 ); 2185 2186PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 2187 Py_UCS4 ch /* Unicode character */ 2188 ); 2189 2190PyAPI_FUNC(int) _PyUnicode_IsDigit( 2191 Py_UCS4 ch /* Unicode character */ 2192 ); 2193 2194PyAPI_FUNC(int) _PyUnicode_IsNumeric( 2195 Py_UCS4 ch /* Unicode character */ 2196 ); 2197 2198PyAPI_FUNC(int) _PyUnicode_IsPrintable( 2199 Py_UCS4 ch /* Unicode character */ 2200 ); 2201 2202PyAPI_FUNC(int) _PyUnicode_IsAlpha( 2203 Py_UCS4 ch /* Unicode character */ 2204 ); 2205 2206PyAPI_FUNC(size_t) Py_UNICODE_strlen( 2207 const Py_UNICODE *u 2208 ); 2209 2210PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 2211 Py_UNICODE *s1, 2212 const Py_UNICODE *s2); 2213 2214PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat( 2215 Py_UNICODE *s1, const Py_UNICODE *s2); 2216 2217PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 2218 Py_UNICODE *s1, 2219 const Py_UNICODE *s2, 2220 size_t n); 2221 2222PyAPI_FUNC(int) Py_UNICODE_strcmp( 2223 const Py_UNICODE *s1, 2224 const Py_UNICODE *s2 2225 ); 2226 2227PyAPI_FUNC(int) Py_UNICODE_strncmp( 2228 const Py_UNICODE *s1, 2229 const Py_UNICODE *s2, 2230 size_t n 2231 ); 2232 2233PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 2234 const Py_UNICODE *s, 2235 Py_UNICODE c 2236 ); 2237 2238PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 2239 const Py_UNICODE *s, 2240 Py_UNICODE c 2241 ); 2242 2243PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int); 2244 2245/* Create a copy of a unicode string ending with a nul character. Return NULL 2246 and raise a MemoryError exception on memory allocation failure, otherwise 2247 return a new allocated buffer (use PyMem_Free() to free the buffer). */ 2248 2249PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 2250 PyObject *unicode 2251 ); 2252#endif /* Py_LIMITED_API */ 2253 2254#if defined(Py_DEBUG) && !defined(Py_LIMITED_API) 2255PyAPI_FUNC(int) _PyUnicode_CheckConsistency( 2256 PyObject *op, 2257 int check_content); 2258#endif 2259 2260#ifndef Py_LIMITED_API 2261/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/ 2262PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); 2263/* Clear all static strings. */ 2264PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); 2265 2266/* Fast equality check when the inputs are known to be exact unicode types 2267 and where the hash values are equal (i.e. a very probable match) */ 2268PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); 2269#endif /* !Py_LIMITED_API */ 2270 2271#ifdef __cplusplus 2272} 2273#endif 2274#endif /* !Py_UNICODEOBJECT_H */ 2275