/* unicodeobject.h revision 90db9c47dca4d105835386fc57d46472b0836820 */
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal. (See 11http://www.egenix.com/files/python/unicode-proposal.txt). 12 13Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58#include <ctype.h> 59 60/* === Internal API ======================================================= */ 61 62/* --- Internal Unicode Format -------------------------------------------- */ 63 64/* Python 3.x requires unicode */ 65#define Py_USING_UNICODE 66 67#ifndef SIZEOF_WCHAR_T 68#error Must define SIZEOF_WCHAR_T 69#endif 70 71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Py_UNICODE was the native Unicode storage format (code unit) used by 87 Python and represents a single Unicode element in the Unicode type. 88 With PEP 393, Py_UNICODE is deprecated and replaced with a 89 typedef to wchar_t. */ 90 91#ifndef Py_LIMITED_API 92#define PY_UNICODE_TYPE wchar_t 93typedef wchar_t Py_UNICODE; 94#endif 95 96/* If the compiler provides a wchar_t type we try to support it 97 through the interface functions PyUnicode_FromWideChar(), 98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). 
*/ 99 100#ifdef HAVE_USABLE_WCHAR_T 101# ifndef HAVE_WCHAR_H 102# define HAVE_WCHAR_H 103# endif 104#endif 105 106#if defined(MS_WINDOWS) 107# define HAVE_MBCS 108#endif 109 110#ifdef HAVE_WCHAR_H 111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 112# ifdef _HAVE_BSDI 113# include <time.h> 114# endif 115# include <wchar.h> 116#endif 117 118/* Py_UCS4 and Py_UCS2 are typedefs for the respective 119 unicode representations. */ 120#if SIZEOF_INT == 4 121typedef unsigned int Py_UCS4; 122#elif SIZEOF_LONG == 4 123typedef unsigned long Py_UCS4; 124#else 125#error "Could not find a proper typedef for Py_UCS4" 126#endif 127 128#if SIZEOF_SHORT == 2 129typedef unsigned short Py_UCS2; 130#else 131#error "Could not find a proper typedef for Py_UCS2" 132#endif 133 134typedef unsigned char Py_UCS1; 135 136/* --- Internal Unicode Operations ---------------------------------------- */ 137 138/* Since splitting on whitespace is an important use case, and 139 whitespace in most situations is solely ASCII whitespace, we 140 optimize for the common case by using a quick look-up table 141 _Py_ascii_whitespace (see below) with an inlined check. 142 143 */ 144#ifndef Py_LIMITED_API 145#define Py_UNICODE_ISSPACE(ch) \ 146 ((ch) < 128U ? 
_Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 147 148#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 149#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 150#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 151#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 152 153#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 154#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 155#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 156 157#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 158#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 159#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 160#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 161 162#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 163#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 164#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 165 166#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 167 168#define Py_UNICODE_ISALNUM(ch) \ 169 (Py_UNICODE_ISALPHA(ch) || \ 170 Py_UNICODE_ISDECIMAL(ch) || \ 171 Py_UNICODE_ISDIGIT(ch) || \ 172 Py_UNICODE_ISNUMERIC(ch)) 173 174#define Py_UNICODE_COPY(target, source, length) \ 175 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 176 177#define Py_UNICODE_FILL(target, value, length) \ 178 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 179 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 180 } while (0) 181 182/* macros to work with surrogates */ 183#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDFFF) 184#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) 185#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) 186/* Join two surrogate characters and return a single Py_UCS4 value. 
*/ 187#define Py_UNICODE_JOIN_SURROGATES(high, low) \ 188 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 189 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 190/* high surrogate = top 10 bits added to D800 */ 191#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10)) 192/* low surrogate = bottom 10 bits added to DC00 */ 193#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF)) 194 195/* Check if substring matches at given offset. The offset must be 196 valid, and the substring must not be empty. */ 197 198#define Py_UNICODE_MATCH(string, offset, substring) \ 199 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ 200 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ 201 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) 202 203#endif /* Py_LIMITED_API */ 204 205#ifdef __cplusplus 206extern "C" { 207#endif 208 209/* --- Unicode Type ------------------------------------------------------- */ 210 211#ifndef Py_LIMITED_API 212 213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject 214 structure. state.ascii and state.compact are set, and the data 215 immediately follow the structure. utf8_length and wstr_length can be found 216 in the length field; the utf8 pointer is equal to the data pointer. 
*/ 217typedef struct { 218 /* There are 4 forms of Unicode strings: 219 220 - compact ascii: 221 222 * structure = PyASCIIObject 223 * test: PyUnicode_IS_COMPACT_ASCII(op) 224 * kind = PyUnicode_1BYTE_KIND 225 * compact = 1 226 * ascii = 1 227 * ready = 1 228 * (length is the length of the utf8 and wstr strings) 229 * (data starts just after the structure) 230 * (since ASCII is decoded from UTF-8, the utf8 string are the data) 231 232 - compact: 233 234 * structure = PyCompactUnicodeObject 235 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op) 236 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 237 PyUnicode_4BYTE_KIND 238 * compact = 1 239 * ready = 1 240 * ascii = 0 241 * utf8 is not shared with data 242 * utf8_length = 0 if utf8 is NULL 243 * wstr is shared with data and wstr_length=length 244 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 245 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4 246 * wstr_length = 0 if wstr is NULL 247 * (data starts just after the structure) 248 249 - legacy string, not ready: 250 251 * structure = PyUnicodeObject 252 * test: kind == PyUnicode_WCHAR_KIND 253 * length = 0 (use wstr_length) 254 * hash = -1 255 * kind = PyUnicode_WCHAR_KIND 256 * compact = 0 257 * ascii = 0 258 * ready = 0 259 * interned = SSTATE_NOT_INTERNED 260 * wstr is not NULL 261 * data.any is NULL 262 * utf8 is NULL 263 * utf8_length = 0 264 265 - legacy string, ready: 266 267 * structure = PyUnicodeObject structure 268 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND 269 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 270 PyUnicode_4BYTE_KIND 271 * compact = 0 272 * ready = 1 273 * data.any is not NULL 274 * utf8 is shared and utf8_length = length with data.any if ascii = 1 275 * utf8_length = 0 if utf8 is NULL 276 * wstr is shared with data.any and wstr_length = length 277 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 278 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 279 * wstr_length = 0 if wstr 
is NULL 280 281 Compact strings use only one memory block (structure + characters), 282 whereas legacy strings use one block for the structure and one block 283 for characters. 284 285 Legacy strings are created by PyUnicode_FromUnicode() and 286 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready 287 when PyUnicode_READY() is called. 288 289 See also _PyUnicode_CheckConsistency(). 290 */ 291 PyObject_HEAD 292 Py_ssize_t length; /* Number of code points in the string */ 293 Py_hash_t hash; /* Hash value; -1 if not set */ 294 struct { 295 /* 296 SSTATE_NOT_INTERNED (0) 297 SSTATE_INTERNED_MORTAL (1) 298 SSTATE_INTERNED_IMMORTAL (2) 299 300 If interned != SSTATE_NOT_INTERNED, the two references from the 301 dictionary to this object are *not* counted in ob_refcnt. 302 */ 303 unsigned int interned:2; 304 /* Character size: 305 306 - PyUnicode_WCHAR_KIND (0): 307 308 * character type = wchar_t (16 or 32 bits, depending on the 309 platform) 310 311 - PyUnicode_1BYTE_KIND (1): 312 313 * character type = Py_UCS1 (8 bits, unsigned) 314 * all characters are in the range U+0000-U+00FF (latin1) 315 * if ascii is set, all characters are in the range U+0000-U+007F 316 (ASCII), otherwise at least one character is in the range 317 U+0080-U+00FF 318 319 - PyUnicode_2BYTE_KIND (2): 320 321 * character type = Py_UCS2 (16 bits, unsigned) 322 * all characters are in the range U+0000-U+FFFF (BMP) 323 * at least one character is in the range U+0100-U+FFFF 324 325 - PyUnicode_4BYTE_KIND (4): 326 327 * character type = Py_UCS4 (32 bits, unsigned) 328 * all characters are in the range U+0000-U+10FFFF 329 * at least one character is in the range U+10000-U+10FFFF 330 */ 331 unsigned int kind:3; 332 /* Compact is with respect to the allocation scheme. Compact unicode 333 objects only require one memory block while non-compact objects use 334 one block for the PyUnicodeObject struct and another for its data 335 buffer. 
*/ 336 unsigned int compact:1; 337 /* The string only contains characters in the range U+0000-U+007F (ASCII) 338 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is 339 set, use the PyASCIIObject structure. */ 340 unsigned int ascii:1; 341 /* The ready flag indicates whether the object layout is initialized 342 completely. This means that this is either a compact object, or 343 the data pointer is filled out. The bit is redundant, and helps 344 to minimize the test in PyUnicode_IS_READY(). */ 345 unsigned int ready:1; 346 } state; 347 wchar_t *wstr; /* wchar_t representation (null-terminated) */ 348} PyASCIIObject; 349 350/* Non-ASCII strings allocated through PyUnicode_New use the 351 PyCompactUnicodeObject structure. state.compact is set, and the data 352 immediately follow the structure. */ 353typedef struct { 354 PyASCIIObject _base; 355 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the 356 * terminating \0. */ 357 char *utf8; /* UTF-8 representation (null-terminated) */ 358 Py_ssize_t wstr_length; /* Number of code points in wstr, possible 359 * surrogates count as two code points. */ 360} PyCompactUnicodeObject; 361 362/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the 363 PyUnicodeObject structure. The actual string data is initially in the wstr 364 block, and copied into the data block using _PyUnicode_Ready. 
*/ 365typedef struct { 366 PyCompactUnicodeObject _base; 367 union { 368 void *any; 369 Py_UCS1 *latin1; 370 Py_UCS2 *ucs2; 371 Py_UCS4 *ucs4; 372 } data; /* Canonical, smallest-form Unicode buffer */ 373} PyUnicodeObject; 374#endif 375 376PyAPI_DATA(PyTypeObject) PyUnicode_Type; 377PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 378 379#define PyUnicode_Check(op) \ 380 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 381#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 382 383/* Fast access macros */ 384#ifndef Py_LIMITED_API 385 386#define PyUnicode_WSTR_LENGTH(op) \ 387 (PyUnicode_IS_COMPACT_ASCII(op) ? \ 388 ((PyASCIIObject*)op)->length : \ 389 ((PyCompactUnicodeObject*)op)->wstr_length) 390 391/* Returns the deprecated Py_UNICODE representation's size in code units 392 (this includes surrogate pairs as 2 units). 393 If the Py_UNICODE representation is not available, it will be computed 394 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ 395 396#define PyUnicode_GET_SIZE(op) \ 397 (assert(PyUnicode_Check(op)), \ 398 (((PyASCIIObject *)(op))->wstr) ? \ 399 PyUnicode_WSTR_LENGTH(op) : \ 400 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \ 401 assert(((PyASCIIObject *)(op))->wstr), \ 402 PyUnicode_WSTR_LENGTH(op))) 403 404#define PyUnicode_GET_DATA_SIZE(op) \ 405 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) 406 407/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE 408 representation on demand. Using this macro is very inefficient now, 409 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or 410 use PyUnicode_WRITE() and PyUnicode_READ(). */ 411 412#define PyUnicode_AS_UNICODE(op) \ 413 (assert(PyUnicode_Check(op)), \ 414 (((PyASCIIObject *)(op))->wstr) ? 
(((PyASCIIObject *)(op))->wstr) : \ 415 PyUnicode_AsUnicode((PyObject *)(op))) 416 417#define PyUnicode_AS_DATA(op) \ 418 ((const char *)(PyUnicode_AS_UNICODE(op))) 419 420 421/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ 422 423/* Values for PyASCIIObject.state: */ 424 425/* Interning state. */ 426#define SSTATE_NOT_INTERNED 0 427#define SSTATE_INTERNED_MORTAL 1 428#define SSTATE_INTERNED_IMMORTAL 2 429 430/* Return true if the string contains only ASCII characters, or 0 if not. The 431 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be 432 ready. */ 433#define PyUnicode_IS_ASCII(op) \ 434 (assert(PyUnicode_Check(op)), \ 435 assert(PyUnicode_IS_READY(op)), \ 436 ((PyASCIIObject*)op)->state.ascii) 437 438/* Return true if the string is compact or 0 if not. 439 No type checks or Ready calls are performed. */ 440#define PyUnicode_IS_COMPACT(op) \ 441 (((PyASCIIObject*)(op))->state.compact) 442 443/* Return true if the string is a compact ASCII string (use PyASCIIObject 444 structure), or 0 if not. No type checks or Ready calls are performed. */ 445#define PyUnicode_IS_COMPACT_ASCII(op) \ 446 (((PyASCIIObject*)op)->state.ascii && PyUnicode_IS_COMPACT(op)) 447 448enum PyUnicode_Kind { 449/* String contains only wstr byte characters. This is only possible 450 when the string was created with a legacy API and _PyUnicode_Ready() 451 has not been called yet. */ 452 PyUnicode_WCHAR_KIND = 0, 453/* Return values of the PyUnicode_KIND() macro: */ 454 PyUnicode_1BYTE_KIND = 1, 455 PyUnicode_2BYTE_KIND = 2, 456 PyUnicode_4BYTE_KIND = 4 457}; 458 459/* Return pointers to the canonical representation cast to unsigned char, 460 Py_UCS2, or Py_UCS4 for direct character access. 461 No checks are performed, use PyUnicode_KIND() before to ensure 462 these will work correctly. 
*/ 463 464#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op)) 465#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) 466#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) 467 468/* Return one of the PyUnicode_*_KIND values defined above. */ 469#define PyUnicode_KIND(op) \ 470 (assert(PyUnicode_Check(op)), \ 471 assert(PyUnicode_IS_READY(op)), \ 472 ((PyASCIIObject *)(op))->state.kind) 473 474/* Return a void pointer to the raw unicode buffer. */ 475#define _PyUnicode_COMPACT_DATA(op) \ 476 (PyUnicode_IS_ASCII(op) ? \ 477 ((void*)((PyASCIIObject*)(op) + 1)) : \ 478 ((void*)((PyCompactUnicodeObject*)(op) + 1))) 479 480#define _PyUnicode_NONCOMPACT_DATA(op) \ 481 (assert(((PyUnicodeObject*)(op))->data.any), \ 482 ((((PyUnicodeObject *)(op))->data.any))) 483 484#define PyUnicode_DATA(op) \ 485 (assert(PyUnicode_Check(op)), \ 486 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \ 487 _PyUnicode_NONCOMPACT_DATA(op)) 488 489/* In the access macros below, "kind" may be evaluated more than once. 490 All other macro parameters are evaluated exactly once, so it is safe 491 to put side effects into them (such as increasing the index). */ 492 493/* Write into the canonical representation, this macro does not do any sanity 494 checks and is intended for usage in loops. The caller should cache the 495 kind and data pointers obtained from other macro calls. 496 index is the index in the string (starts at 0) and value is the new 497 code point value which should be written to that location. 
*/ 498#define PyUnicode_WRITE(kind, data, index, value) \ 499 do { \ 500 switch ((kind)) { \ 501 case PyUnicode_1BYTE_KIND: { \ 502 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \ 503 break; \ 504 } \ 505 case PyUnicode_2BYTE_KIND: { \ 506 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \ 507 break; \ 508 } \ 509 default: { \ 510 assert((kind) == PyUnicode_4BYTE_KIND); \ 511 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \ 512 } \ 513 } \ 514 } while (0) 515 516/* Read a code point from the string's canonical representation. No checks 517 or ready calls are performed. */ 518#define PyUnicode_READ(kind, data, index) \ 519 ((Py_UCS4) \ 520 ((kind) == PyUnicode_1BYTE_KIND ? \ 521 ((const Py_UCS1 *)(data))[(index)] : \ 522 ((kind) == PyUnicode_2BYTE_KIND ? \ 523 ((const Py_UCS2 *)(data))[(index)] : \ 524 ((const Py_UCS4 *)(data))[(index)] \ 525 ) \ 526 )) 527 528/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it 529 calls PyUnicode_KIND() and might call it twice. For single reads, use 530 PyUnicode_READ_CHAR, for multiple consecutive reads callers should 531 cache kind and use PyUnicode_READ instead. */ 532#define PyUnicode_READ_CHAR(unicode, index) \ 533 (assert(PyUnicode_Check(unicode)), \ 534 assert(PyUnicode_IS_READY(unicode)), \ 535 (Py_UCS4) \ 536 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \ 537 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \ 538 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \ 539 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \ 540 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \ 541 ) \ 542 )) 543 544/* Returns the length of the unicode string. The caller has to make sure that 545 the string has it's canonical representation set before calling 546 this macro. Call PyUnicode_(FAST_)Ready to ensure that. 
*/ 547#define PyUnicode_GET_LENGTH(op) \ 548 (assert(PyUnicode_Check(op)), \ 549 assert(PyUnicode_IS_READY(op)), \ 550 ((PyASCIIObject *)(op))->length) 551 552 553/* Fast check to determine whether an object is ready. Equivalent to 554 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */ 555 556#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) 557 558/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best 559 case. If the canonical representation is not yet set, it will still call 560 _PyUnicode_Ready(). 561 Returns 0 on success and -1 on errors. */ 562#define PyUnicode_READY(op) \ 563 (assert(PyUnicode_Check(op)), \ 564 (PyUnicode_IS_READY(op) ? \ 565 0 : _PyUnicode_Ready((PyObject *)(op)))) 566 567/* Return a maximum character value which is suitable for creating another 568 string based on op. This is always an approximation but more efficient 569 than iterating over the string. */ 570#define PyUnicode_MAX_CHAR_VALUE(op) \ 571 (assert(PyUnicode_IS_READY(op)), \ 572 (PyUnicode_IS_ASCII(op) ? \ 573 (0x7f) : \ 574 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ 575 (0xffU) : \ 576 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \ 577 (0xffffU) : \ 578 (0x10ffffU))))) 579 580#endif 581 582/* --- Constants ---------------------------------------------------------- */ 583 584/* This Unicode character will be used as replacement character during 585 decoding if the errors argument is set to "replace". Note: the 586 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 587 Unicode 3.0. */ 588 589#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 590 591/* === Public API ========================================================= */ 592 593/* --- Plain Py_UNICODE --------------------------------------------------- */ 594 595/* With PEP 393, this is the recommended way to allocate a new unicode object. 596 This function will allocate the object and its buffer in a single memory 597 block. 
Objects created using this function are not resizable. */ 598#ifndef Py_LIMITED_API 599PyAPI_FUNC(PyObject*) PyUnicode_New( 600 Py_ssize_t size, /* Number of code points in the new string */ 601 Py_UCS4 maxchar /* maximum code point value in the string */ 602 ); 603#endif 604 605/* Initializes the canonical string representation from a the deprecated 606 wstr/Py_UNICODE representation. This function is used to convert Unicode 607 objects which were created using the old API to the new flexible format 608 introduced with PEP 393. 609 610 Don't call this function directly, use the public PyUnicode_READY() macro 611 instead. */ 612#ifndef Py_LIMITED_API 613PyAPI_FUNC(int) _PyUnicode_Ready( 614 PyObject *unicode /* Unicode object */ 615 ); 616#endif 617 618/* Get a copy of a Unicode string. */ 619#ifndef Py_LIMITED_API 620PyAPI_FUNC(PyObject*) _PyUnicode_Copy( 621 PyObject *unicode 622 ); 623#endif 624 625/* Copy character from one unicode object into another, this function performs 626 character conversion when necessary and falls back to memcpy() if possible. 627 628 Fail if to is too small (smaller than *how_many* or smaller than 629 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > 630 kind(to), or if *to* has more than 1 reference. 631 632 Return the number of written character, or return -1 and raise an exception 633 on error. 634 635 Pseudo-code: 636 637 how_many = min(how_many, len(from) - from_start) 638 to[to_start:to_start+how_many] = from[from_start:from_start+how_many] 639 return how_many 640 641 Note: The function doesn't write a terminating null character. 642 */ 643#ifndef Py_LIMITED_API 644PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( 645 PyObject *to, 646 Py_ssize_t to_start, 647 PyObject *from, 648 Py_ssize_t from_start, 649 Py_ssize_t how_many 650 ); 651 652/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so 653 may crash if parameters are invalid (e.g. if the output string 654 is too short). 
*/ 655PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters( 656 PyObject *to, 657 Py_ssize_t to_start, 658 PyObject *from, 659 Py_ssize_t from_start, 660 Py_ssize_t how_many 661 ); 662#endif 663 664#ifndef Py_LIMITED_API 665/* Fill a string with a character: write fill_char into 666 unicode[start:start+length]. 667 668 Fail if fill_char is bigger than the string maximum character, or if the 669 string has more than 1 reference. 670 671 Return the number of written character, or return -1 and raise an exception 672 on error. */ 673PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill( 674 PyObject *unicode, 675 Py_ssize_t start, 676 Py_ssize_t length, 677 Py_UCS4 fill_char 678 ); 679 680/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash 681 if parameters are invalid (e.g. if length is longer than the string). */ 682PyAPI_FUNC(void) _PyUnicode_FastFill( 683 PyObject *unicode, 684 Py_ssize_t start, 685 Py_ssize_t length, 686 Py_UCS4 fill_char 687 ); 688#endif 689 690/* Create a Unicode Object from the Py_UNICODE buffer u of the given 691 size. 692 693 u may be NULL which causes the contents to be undefined. It is the 694 user's responsibility to fill in the needed data afterwards. Note 695 that modifying the Unicode object contents after construction is 696 only allowed if u was set to NULL. 697 698 The buffer is copied into the new object. */ 699 700#ifndef Py_LIMITED_API 701PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 702 const Py_UNICODE *u, /* Unicode buffer */ 703 Py_ssize_t size /* size of buffer */ 704 ); 705#endif 706 707/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 708PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 709 const char *u, /* UTF-8 encoded string */ 710 Py_ssize_t size /* size of buffer */ 711 ); 712 713/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 714 UTF-8 encoded bytes. The size is determined with strlen(). 
*/ 715PyAPI_FUNC(PyObject*) PyUnicode_FromString( 716 const char *u /* UTF-8 encoded string */ 717 ); 718 719#ifndef Py_LIMITED_API 720/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. 721 Scan the string to find the maximum character. */ 722PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( 723 int kind, 724 const void *buffer, 725 Py_ssize_t size); 726 727/* Create a new string from a buffer of ASCII characters. 728 WARNING: Don't check if the string contains any non-ASCII character. */ 729PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII( 730 const char *buffer, 731 Py_ssize_t size); 732#endif 733 734PyAPI_FUNC(PyObject*) PyUnicode_Substring( 735 PyObject *str, 736 Py_ssize_t start, 737 Py_ssize_t end); 738 739#ifndef Py_LIMITED_API 740/* Compute the maximum character of the substring unicode[start:end]. 741 Return 127 for an empty string. */ 742PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( 743 PyObject *unicode, 744 Py_ssize_t start, 745 Py_ssize_t end); 746#endif 747 748/* Copy the string into a UCS4 buffer including the null character if copy_null 749 is set. Return NULL and raise an exception on error. Raise a ValueError if 750 the buffer is smaller than the string. Return buffer on success. 751 752 buflen is the length of the buffer in (Py_UCS4) characters. */ 753PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 754 PyObject *unicode, 755 Py_UCS4* buffer, 756 Py_ssize_t buflen, 757 int copy_null); 758 759/* Copy the string into a UCS4 buffer. A new buffer is allocated using 760 * PyMem_Malloc; if this fails, NULL is returned with a memory error 761 exception set. */ 762PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 763 764/* Return a read-only pointer to the Unicode object's internal 765 Py_UNICODE buffer. 766 If the wchar_t/Py_UNICODE representation is not yet available, this 767 function will calculate it. 
*/ 768 769#ifndef Py_LIMITED_API 770PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 771 PyObject *unicode /* Unicode object */ 772 ); 773#endif 774 775/* Return a read-only pointer to the Unicode object's internal 776 Py_UNICODE buffer and save the length at size. 777 If the wchar_t/Py_UNICODE representation is not yet available, this 778 function will calculate it. */ 779 780#ifndef Py_LIMITED_API 781PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( 782 PyObject *unicode, /* Unicode object */ 783 Py_ssize_t *size /* location where to save the length */ 784 ); 785#endif 786 787/* Get the length of the Unicode object. */ 788 789PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 790 PyObject *unicode 791); 792 793/* Get the number of Py_UNICODE units in the 794 string representation. */ 795 796PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 797 PyObject *unicode /* Unicode object */ 798 ); 799 800/* Read a character from the string. */ 801 802PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 803 PyObject *unicode, 804 Py_ssize_t index 805 ); 806 807/* Write a character to the string. The string must have been created through 808 PyUnicode_New, must not be shared, and must not have been hashed yet. 809 810 Return 0 on success, -1 on error. */ 811 812PyAPI_FUNC(int) PyUnicode_WriteChar( 813 PyObject *unicode, 814 Py_ssize_t index, 815 Py_UCS4 character 816 ); 817 818#ifndef Py_LIMITED_API 819/* Get the maximum ordinal for a Unicode character. */ 820PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 821#endif 822 823/* Resize an Unicode object. The length is the number of characters, except 824 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length 825 is the number of Py_UNICODE characters. 826 827 *unicode is modified to point to the new (resized) object and 0 828 returned on success. 829 830 Try to resize the string in place (which is usually faster than allocating 831 a new string and copy characters), or create a new string. 
832 833 Error handling is implemented as follows: an exception is set, -1 834 is returned and *unicode left untouched. 835 836 WARNING: The function doesn't check string content, the result may not be a 837 string in canonical representation. */ 838 839PyAPI_FUNC(int) PyUnicode_Resize( 840 PyObject **unicode, /* Pointer to the Unicode object */ 841 Py_ssize_t length /* New length */ 842 ); 843 844/* Coerce obj to an Unicode object and return a reference with 845 *incremented* refcount. 846 847 Coercion is done in the following way: 848 849 1. bytes, bytearray and other char buffer compatible objects are decoded 850 under the assumptions that they contain data using the UTF-8 851 encoding. Decoding is done in "strict" mode. 852 853 2. All other objects (including Unicode objects) raise an 854 exception. 855 856 The API returns NULL in case of an error. The caller is responsible 857 for decref'ing the returned objects. 858 859*/ 860 861PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 862 register PyObject *obj, /* Object */ 863 const char *encoding, /* encoding */ 864 const char *errors /* error handling */ 865 ); 866 867/* Coerce obj to an Unicode object and return a reference with 868 *incremented* refcount. 869 870 Unicode objects are passed back as-is (subclasses are converted to 871 true Unicode objects), all other objects are delegated to 872 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 873 using UTF-8 encoding as basis for decoding the object. 874 875 The API returns NULL in case of an error. The caller is responsible 876 for decref'ing the returned objects. 877 878*/ 879 880PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 881 register PyObject *obj /* Object */ 882 ); 883 884PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 885 const char *format, /* ASCII-encoded string */ 886 va_list vargs 887 ); 888PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 889 const char *format, /* ASCII-encoded string */ 890 ... 
891 ); 892 893#ifndef Py_LIMITED_API 894typedef struct { 895 PyObject *buffer; 896 void *data; 897 enum PyUnicode_Kind kind; 898 Py_UCS4 maxchar; 899 Py_ssize_t size; 900 Py_ssize_t pos; 901 /* minimum length of the buffer when overallocation is enabled, 902 see _PyUnicodeWriter_Init() */ 903 Py_ssize_t min_length; 904 unsigned char overallocate; 905 /* If readonly is 1, buffer is a shared string (cannot be modified) 906 and size is set to 0. */ 907 unsigned char readonly; 908} _PyUnicodeWriter ; 909 910/* Initialize a Unicode writer. 911 912 If min_length is greater than zero, _PyUnicodeWriter_Prepare() 913 overallocates the buffer and min_length is the minimum length in characters 914 of the buffer. */ 915PyAPI_FUNC(void) 916_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length); 917 918/* Prepare the buffer to write 'length' characters 919 with the specified maximum character. 920 921 Return 0 on success, raise an exception and return -1 on error. */ 922#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \ 923 (((MAXCHAR) <= (WRITER)->maxchar \ 924 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \ 925 ? 0 \ 926 : (((LENGTH) == 0) \ 927 ? 0 \ 928 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))) 929 930/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro 931 instead. */ 932PyAPI_FUNC(int) 933_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 934 Py_ssize_t length, Py_UCS4 maxchar); 935 936PyAPI_FUNC(int) 937_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str); 938 939PyAPI_FUNC(PyObject *) 940_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer); 941 942PyAPI_FUNC(void) 943_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer); 944#endif 945 946#ifndef Py_LIMITED_API 947/* Format the object based on the format_spec, as defined in PEP 3101 948 (Advanced String Formatting). 
*/ 949PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter( 950 _PyUnicodeWriter *writer, 951 PyObject *obj, 952 PyObject *format_spec, 953 Py_ssize_t start, 954 Py_ssize_t end); 955#endif 956 957PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 958PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 959PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 960 const char *u /* UTF-8 encoded string */ 961 ); 962#ifndef Py_LIMITED_API 963PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 964#endif 965 966/* Use only if you know it's a string */ 967#define PyUnicode_CHECK_INTERNED(op) \ 968 (((PyASCIIObject *)(op))->state.interned) 969 970/* --- wchar_t support for platforms which support it --------------------- */ 971 972#ifdef HAVE_WCHAR_H 973 974/* Create a Unicode Object from the wchar_t buffer w of the given 975 size. 976 977 The buffer is copied into the new object. */ 978 979PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 980 register const wchar_t *w, /* wchar_t buffer */ 981 Py_ssize_t size /* size of buffer */ 982 ); 983 984/* Copies the Unicode Object contents into the wchar_t buffer w. At 985 most size wchar_t characters are copied. 986 987 Note that the resulting wchar_t string may or may not be 988 0-terminated. It is the responsibility of the caller to make sure 989 that the wchar_t string is 0-terminated in case this is required by 990 the application. 991 992 Returns the number of wchar_t characters copied (excluding a 993 possibly trailing 0-termination character) or -1 in case of an 994 error. */ 995 996PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 997 PyObject *unicode, /* Unicode object */ 998 register wchar_t *w, /* wchar_t buffer */ 999 Py_ssize_t size /* size of buffer */ 1000 ); 1001 1002/* Convert the Unicode object to a wide character string. The output string 1003 always ends with a nul character. If size is not NULL, write the number of 1004 wide characters (excluding the null character) into *size. 
1005 1006 Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it) 1007 on success. On error, returns NULL, *size is undefined and raises a 1008 MemoryError. */ 1009 1010PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 1011 PyObject *unicode, /* Unicode object */ 1012 Py_ssize_t *size /* number of characters of the result */ 1013 ); 1014 1015#ifndef Py_LIMITED_API 1016PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); 1017#endif 1018 1019#endif 1020 1021/* --- Unicode ordinals --------------------------------------------------- */ 1022 1023/* Create a Unicode Object from the given Unicode code point ordinal. 1024 1025 The ordinal must be in range(0x10000) on narrow Python builds 1026 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 1027 raised in case it is not. 1028 1029*/ 1030 1031PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 1032 1033/* --- Free-list management ----------------------------------------------- */ 1034 1035/* Clear the free list used by the Unicode implementation. 1036 1037 This can be used to release memory used for objects on the free 1038 list back to the Python memory allocator. 1039 1040*/ 1041 1042PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 1043 1044/* === Builtin Codecs ===================================================== 1045 1046 Many of these APIs take two arguments encoding and errors. These 1047 parameters encoding and errors have the same semantics as the ones 1048 of the builtin str() API. 1049 1050 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 1051 1052 Error handling is set by errors which may also be set to NULL 1053 meaning to use the default handling defined for the codec. Default 1054 error handling for all builtin codecs is "strict" (ValueErrors are 1055 raised). 1056 1057 The codecs all use a similar interface. Only deviation from the 1058 generic ones are documented. 
1059 1060*/ 1061 1062/* --- Manage the default encoding ---------------------------------------- */ 1063 1064/* Returns a pointer to the default encoding (UTF-8) of the 1065 Unicode object unicode and the size of the encoded representation 1066 in bytes stored in *size. 1067 1068 In case of an error, no *size is set. 1069 1070 This function caches the UTF-8 encoded string in the unicodeobject 1071 and subsequent calls will return the same string. The memory is released 1072 when the unicodeobject is deallocated. 1073 1074 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to 1075 support the previous internal function with the same behaviour. 1076 1077 *** This API is for interpreter INTERNAL USE ONLY and will likely 1078 *** be removed or changed in the future. 1079 1080 *** If you need to access the Unicode object as UTF-8 bytes string, 1081 *** please use PyUnicode_AsUTF8String() instead. 1082*/ 1083 1084#ifndef Py_LIMITED_API 1085PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize( 1086 PyObject *unicode, 1087 Py_ssize_t *size); 1088#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize 1089#endif 1090 1091/* Returns a pointer to the default encoding (UTF-8) of the 1092 Unicode object unicode. 1093 1094 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation 1095 in the unicodeobject. 1096 1097 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to 1098 support the previous internal function with the same behaviour. 1099 1100 Use of this API is DEPRECATED since no size information can be 1101 extracted from the returned data. 1102 1103 *** This API is for interpreter INTERNAL USE ONLY and will likely 1104 *** be removed or changed for Python 3.1. 1105 1106 *** If you need to access the Unicode object as UTF-8 bytes string, 1107 *** please use PyUnicode_AsUTF8String() instead. 
1108 1109*/ 1110 1111#ifndef Py_LIMITED_API 1112PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode); 1113#define _PyUnicode_AsString PyUnicode_AsUTF8 1114#endif 1115 1116/* Returns "utf-8". */ 1117 1118PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 1119 1120/* --- Generic Codecs ----------------------------------------------------- */ 1121 1122/* Create a Unicode object by decoding the encoded string s of the 1123 given size. */ 1124 1125PyAPI_FUNC(PyObject*) PyUnicode_Decode( 1126 const char *s, /* encoded string */ 1127 Py_ssize_t size, /* size of buffer */ 1128 const char *encoding, /* encoding */ 1129 const char *errors /* error handling */ 1130 ); 1131 1132/* Decode a Unicode object unicode and return the result as Python 1133 object. */ 1134 1135PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 1136 PyObject *unicode, /* Unicode object */ 1137 const char *encoding, /* encoding */ 1138 const char *errors /* error handling */ 1139 ); 1140 1141/* Decode a Unicode object unicode and return the result as Unicode 1142 object. */ 1143 1144PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 1145 PyObject *unicode, /* Unicode object */ 1146 const char *encoding, /* encoding */ 1147 const char *errors /* error handling */ 1148 ); 1149 1150/* Encodes a Py_UNICODE buffer of the given size and returns a 1151 Python string object. */ 1152 1153#ifndef Py_LIMITED_API 1154PyAPI_FUNC(PyObject*) PyUnicode_Encode( 1155 const Py_UNICODE *s, /* Unicode char buffer */ 1156 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 1157 const char *encoding, /* encoding */ 1158 const char *errors /* error handling */ 1159 ); 1160#endif 1161 1162/* Encodes a Unicode object and returns the result as Python 1163 object. 
*/ 1164 1165PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 1166 PyObject *unicode, /* Unicode object */ 1167 const char *encoding, /* encoding */ 1168 const char *errors /* error handling */ 1169 ); 1170 1171/* Encodes a Unicode object and returns the result as Python string 1172 object. */ 1173 1174PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 1175 PyObject *unicode, /* Unicode object */ 1176 const char *encoding, /* encoding */ 1177 const char *errors /* error handling */ 1178 ); 1179 1180/* Encodes a Unicode object and returns the result as Unicode 1181 object. */ 1182 1183PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 1184 PyObject *unicode, /* Unicode object */ 1185 const char *encoding, /* encoding */ 1186 const char *errors /* error handling */ 1187 ); 1188 1189/* Build an encoding map. */ 1190 1191PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 1192 PyObject* string /* 256 character map */ 1193 ); 1194 1195/* --- UTF-7 Codecs ------------------------------------------------------- */ 1196 1197PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 1198 const char *string, /* UTF-7 encoded string */ 1199 Py_ssize_t length, /* size of string */ 1200 const char *errors /* error handling */ 1201 ); 1202 1203PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 1204 const char *string, /* UTF-7 encoded string */ 1205 Py_ssize_t length, /* size of string */ 1206 const char *errors, /* error handling */ 1207 Py_ssize_t *consumed /* bytes consumed */ 1208 ); 1209 1210#ifndef Py_LIMITED_API 1211PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 1212 const Py_UNICODE *data, /* Unicode char buffer */ 1213 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1214 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1215 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1216 const char *errors /* error handling */ 1217 ); 1218PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7( 1219 PyObject *unicode, /* Unicode object */ 1220 int base64SetO, /* 
Encode RFC2152 Set O characters in base64 */ 1221 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1222 const char *errors /* error handling */ 1223 ); 1224#endif 1225 1226/* --- UTF-8 Codecs ------------------------------------------------------- */ 1227 1228PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 1229 const char *string, /* UTF-8 encoded string */ 1230 Py_ssize_t length, /* size of string */ 1231 const char *errors /* error handling */ 1232 ); 1233 1234PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 1235 const char *string, /* UTF-8 encoded string */ 1236 Py_ssize_t length, /* size of string */ 1237 const char *errors, /* error handling */ 1238 Py_ssize_t *consumed /* bytes consumed */ 1239 ); 1240 1241PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 1242 PyObject *unicode /* Unicode object */ 1243 ); 1244 1245#ifndef Py_LIMITED_API 1246PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( 1247 PyObject *unicode, 1248 const char *errors); 1249 1250PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 1251 const Py_UNICODE *data, /* Unicode char buffer */ 1252 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1253 const char *errors /* error handling */ 1254 ); 1255#endif 1256 1257/* --- UTF-32 Codecs ------------------------------------------------------ */ 1258 1259/* Decodes length bytes from a UTF-32 encoded buffer string and returns 1260 the corresponding Unicode object. 1261 1262 errors (if non-NULL) defines the error handling. It defaults 1263 to "strict". 1264 1265 If byteorder is non-NULL, the decoder starts decoding using the 1266 given byte order: 1267 1268 *byteorder == -1: little endian 1269 *byteorder == 0: native order 1270 *byteorder == 1: big endian 1271 1272 In native mode, the first four bytes of the stream are checked for a 1273 BOM mark. If found, the BOM mark is analysed, the byte order 1274 adjusted and the BOM skipped. In the other modes, no BOM mark 1275 interpretation is done. 
After completion, *byteorder is set to the 1276 current byte order at the end of input data. 1277 1278 If byteorder is NULL, the codec starts in native order mode. 1279 1280*/ 1281 1282PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 1283 const char *string, /* UTF-32 encoded string */ 1284 Py_ssize_t length, /* size of string */ 1285 const char *errors, /* error handling */ 1286 int *byteorder /* pointer to byteorder to use 1287 0=native;-1=LE,1=BE; updated on 1288 exit */ 1289 ); 1290 1291PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 1292 const char *string, /* UTF-32 encoded string */ 1293 Py_ssize_t length, /* size of string */ 1294 const char *errors, /* error handling */ 1295 int *byteorder, /* pointer to byteorder to use 1296 0=native;-1=LE,1=BE; updated on 1297 exit */ 1298 Py_ssize_t *consumed /* bytes consumed */ 1299 ); 1300 1301/* Returns a Python string using the UTF-32 encoding in native byte 1302 order. The string always starts with a BOM mark. */ 1303 1304PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 1305 PyObject *unicode /* Unicode object */ 1306 ); 1307 1308/* Returns a Python string object holding the UTF-32 encoded value of 1309 the Unicode data. 1310 1311 If byteorder is not 0, output is written according to the following 1312 byte order: 1313 1314 byteorder == -1: little endian 1315 byteorder == 0: native byte order (writes a BOM mark) 1316 byteorder == 1: big endian 1317 1318 If byteorder is 0, the output string will always start with the 1319 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1320 prepended. 
1321 1322*/ 1323 1324#ifndef Py_LIMITED_API 1325PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 1326 const Py_UNICODE *data, /* Unicode char buffer */ 1327 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1328 const char *errors, /* error handling */ 1329 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1330 ); 1331PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32( 1332 PyObject *object, /* Unicode object */ 1333 const char *errors, /* error handling */ 1334 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1335 ); 1336#endif 1337 1338/* --- UTF-16 Codecs ------------------------------------------------------ */ 1339 1340/* Decodes length bytes from a UTF-16 encoded buffer string and returns 1341 the corresponding Unicode object. 1342 1343 errors (if non-NULL) defines the error handling. It defaults 1344 to "strict". 1345 1346 If byteorder is non-NULL, the decoder starts decoding using the 1347 given byte order: 1348 1349 *byteorder == -1: little endian 1350 *byteorder == 0: native order 1351 *byteorder == 1: big endian 1352 1353 In native mode, the first two bytes of the stream are checked for a 1354 BOM mark. If found, the BOM mark is analysed, the byte order 1355 adjusted and the BOM skipped. In the other modes, no BOM mark 1356 interpretation is done. After completion, *byteorder is set to the 1357 current byte order at the end of input data. 1358 1359 If byteorder is NULL, the codec starts in native order mode. 
1360 1361*/ 1362 1363PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 1364 const char *string, /* UTF-16 encoded string */ 1365 Py_ssize_t length, /* size of string */ 1366 const char *errors, /* error handling */ 1367 int *byteorder /* pointer to byteorder to use 1368 0=native;-1=LE,1=BE; updated on 1369 exit */ 1370 ); 1371 1372PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 1373 const char *string, /* UTF-16 encoded string */ 1374 Py_ssize_t length, /* size of string */ 1375 const char *errors, /* error handling */ 1376 int *byteorder, /* pointer to byteorder to use 1377 0=native;-1=LE,1=BE; updated on 1378 exit */ 1379 Py_ssize_t *consumed /* bytes consumed */ 1380 ); 1381 1382/* Returns a Python string using the UTF-16 encoding in native byte 1383 order. The string always starts with a BOM mark. */ 1384 1385PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 1386 PyObject *unicode /* Unicode object */ 1387 ); 1388 1389/* Returns a Python string object holding the UTF-16 encoded value of 1390 the Unicode data. 1391 1392 If byteorder is not 0, output is written according to the following 1393 byte order: 1394 1395 byteorder == -1: little endian 1396 byteorder == 0: native byte order (writes a BOM mark) 1397 byteorder == 1: big endian 1398 1399 If byteorder is 0, the output string will always start with the 1400 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1401 prepended. 1402 1403 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 1404 UCS-2. This trick makes it possible to add full UTF-16 capabilities 1405 at a later point without compromising the APIs. 
1406 1407*/ 1408 1409#ifndef Py_LIMITED_API 1410PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 1411 const Py_UNICODE *data, /* Unicode char buffer */ 1412 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1413 const char *errors, /* error handling */ 1414 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1415 ); 1416PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16( 1417 PyObject* unicode, /* Unicode object */ 1418 const char *errors, /* error handling */ 1419 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1420 ); 1421#endif 1422 1423/* --- Unicode-Escape Codecs ---------------------------------------------- */ 1424 1425PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 1426 const char *string, /* Unicode-Escape encoded string */ 1427 Py_ssize_t length, /* size of string */ 1428 const char *errors /* error handling */ 1429 ); 1430 1431PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 1432 PyObject *unicode /* Unicode object */ 1433 ); 1434 1435#ifndef Py_LIMITED_API 1436PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 1437 const Py_UNICODE *data, /* Unicode char buffer */ 1438 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1439 ); 1440#endif 1441 1442/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 1443 1444PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 1445 const char *string, /* Raw-Unicode-Escape encoded string */ 1446 Py_ssize_t length, /* size of string */ 1447 const char *errors /* error handling */ 1448 ); 1449 1450PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 1451 PyObject *unicode /* Unicode object */ 1452 ); 1453 1454#ifndef Py_LIMITED_API 1455PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 1456 const Py_UNICODE *data, /* Unicode char buffer */ 1457 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1458 ); 1459#endif 1460 1461/* --- Unicode Internal Codec --------------------------------------------- 1462 1463 Only for internal use 
in _codecsmodule.c */ 1464 1465#ifndef Py_LIMITED_API 1466PyObject *_PyUnicode_DecodeUnicodeInternal( 1467 const char *string, 1468 Py_ssize_t length, 1469 const char *errors 1470 ); 1471#endif 1472 1473/* --- Latin-1 Codecs ----------------------------------------------------- 1474 1475 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1476 1477*/ 1478 1479PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1480 const char *string, /* Latin-1 encoded string */ 1481 Py_ssize_t length, /* size of string */ 1482 const char *errors /* error handling */ 1483 ); 1484 1485PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1486 PyObject *unicode /* Unicode object */ 1487 ); 1488 1489#ifndef Py_LIMITED_API 1490PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( 1491 PyObject* unicode, 1492 const char* errors); 1493 1494PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1495 const Py_UNICODE *data, /* Unicode char buffer */ 1496 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1497 const char *errors /* error handling */ 1498 ); 1499#endif 1500 1501/* --- ASCII Codecs ------------------------------------------------------- 1502 1503 Only 7-bit ASCII data is excepted. All other codes generate errors. 
1504 1505*/ 1506 1507PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1508 const char *string, /* ASCII encoded string */ 1509 Py_ssize_t length, /* size of string */ 1510 const char *errors /* error handling */ 1511 ); 1512 1513PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1514 PyObject *unicode /* Unicode object */ 1515 ); 1516 1517#ifndef Py_LIMITED_API 1518PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( 1519 PyObject* unicode, 1520 const char* errors); 1521 1522PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1523 const Py_UNICODE *data, /* Unicode char buffer */ 1524 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1525 const char *errors /* error handling */ 1526 ); 1527#endif 1528 1529/* --- Character Map Codecs ----------------------------------------------- 1530 1531 This codec uses mappings to encode and decode characters. 1532 1533 Decoding mappings must map single string characters to single 1534 Unicode characters, integers (which are then interpreted as Unicode 1535 ordinals) or None (meaning "undefined mapping" and causing an 1536 error). 1537 1538 Encoding mappings must map single Unicode characters to single 1539 string characters, integers (which are then interpreted as Latin-1 1540 ordinals) or None (meaning "undefined mapping" and causing an 1541 error). 1542 1543 If a character lookup fails with a LookupError, the character is 1544 copied as-is meaning that its ordinal value will be interpreted as 1545 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1546 to contain those mappings which map characters to different code 1547 points. 
1548 1549*/ 1550 1551PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1552 const char *string, /* Encoded string */ 1553 Py_ssize_t length, /* size of string */ 1554 PyObject *mapping, /* character mapping 1555 (char ordinal -> unicode ordinal) */ 1556 const char *errors /* error handling */ 1557 ); 1558 1559PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1560 PyObject *unicode, /* Unicode object */ 1561 PyObject *mapping /* character mapping 1562 (unicode ordinal -> char ordinal) */ 1563 ); 1564 1565#ifndef Py_LIMITED_API 1566PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1567 const Py_UNICODE *data, /* Unicode char buffer */ 1568 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1569 PyObject *mapping, /* character mapping 1570 (unicode ordinal -> char ordinal) */ 1571 const char *errors /* error handling */ 1572 ); 1573PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap( 1574 PyObject *unicode, /* Unicode object */ 1575 PyObject *mapping, /* character mapping 1576 (unicode ordinal -> char ordinal) */ 1577 const char *errors /* error handling */ 1578 ); 1579#endif 1580 1581/* Translate a Py_UNICODE buffer of the given length by applying a 1582 character mapping table to it and return the resulting Unicode 1583 object. 1584 1585 The mapping table must map Unicode ordinal integers to Unicode 1586 ordinal integers or None (causing deletion of the character). 1587 1588 Mapping tables may be dictionaries or sequences. Unmapped character 1589 ordinals (ones which cause a LookupError) are left untouched and 1590 are copied as-is. 
1591 1592*/ 1593 1594#ifndef Py_LIMITED_API 1595PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1596 const Py_UNICODE *data, /* Unicode char buffer */ 1597 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1598 PyObject *table, /* Translate table */ 1599 const char *errors /* error handling */ 1600 ); 1601#endif 1602 1603#ifdef HAVE_MBCS 1604 1605/* --- MBCS codecs for Windows -------------------------------------------- */ 1606 1607PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1608 const char *string, /* MBCS encoded string */ 1609 Py_ssize_t length, /* size of string */ 1610 const char *errors /* error handling */ 1611 ); 1612 1613PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1614 const char *string, /* MBCS encoded string */ 1615 Py_ssize_t length, /* size of string */ 1616 const char *errors, /* error handling */ 1617 Py_ssize_t *consumed /* bytes consumed */ 1618 ); 1619 1620PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 1621 int code_page, /* code page number */ 1622 const char *string, /* encoded string */ 1623 Py_ssize_t length, /* size of string */ 1624 const char *errors, /* error handling */ 1625 Py_ssize_t *consumed /* bytes consumed */ 1626 ); 1627 1628PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1629 PyObject *unicode /* Unicode object */ 1630 ); 1631 1632#ifndef Py_LIMITED_API 1633PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1634 const Py_UNICODE *data, /* Unicode char buffer */ 1635 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1636 const char *errors /* error handling */ 1637 ); 1638#endif 1639 1640PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 1641 int code_page, /* code page number */ 1642 PyObject *unicode, /* Unicode object */ 1643 const char *errors /* error handling */ 1644 ); 1645 1646#endif /* HAVE_MBCS */ 1647 1648/* --- Decimal Encoder ---------------------------------------------------- */ 1649 1650/* Takes a Unicode string holding a decimal value and writes it into 1651 an output buffer using 
standard ASCII digit codes. 1652 1653 The output buffer has to provide at least length+1 bytes of storage 1654 area. The output string is 0-terminated. 1655 1656 The encoder converts whitespace to ' ', decimal characters to their 1657 corresponding ASCII digit and all other Latin-1 characters except 1658 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 1659 are treated as errors. This includes embedded NULL bytes. 1660 1661 Error handling is defined by the errors argument: 1662 1663 NULL or "strict": raise a ValueError 1664 "ignore": ignore the wrong characters (these are not copied to the 1665 output buffer) 1666 "replace": replaces illegal characters with '?' 1667 1668 Returns 0 on success, -1 on failure. 1669 1670*/ 1671 1672#ifndef Py_LIMITED_API 1673PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1674 Py_UNICODE *s, /* Unicode buffer */ 1675 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1676 char *output, /* Output buffer; must have size >= length */ 1677 const char *errors /* error handling */ 1678 ); 1679#endif 1680 1681/* Transforms code points that have decimal digit property to the 1682 corresponding ASCII digit code points. 1683 1684 Returns a new Unicode string on success, NULL on failure. 1685*/ 1686 1687#ifndef Py_LIMITED_API 1688PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( 1689 Py_UNICODE *s, /* Unicode buffer */ 1690 Py_ssize_t length /* Number of Py_UNICODE chars to transform */ 1691 ); 1692#endif 1693 1694/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject 1695 as argument instead of a raw buffer and length. This function additionally 1696 transforms spaces to ASCII because this is what the callers in longobject, 1697 floatobject, and complexobject did anyways. 
*/ 1698 1699#ifndef Py_LIMITED_API 1700PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( 1701 PyObject *unicode /* Unicode object */ 1702 ); 1703#endif 1704 1705/* --- Locale encoding --------------------------------------------------- */ 1706 1707/* Decode a string from the current locale encoding. The decoder is strict if 1708 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 1709 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 1710 be decoded as a surrogate character and *surrogateescape* is not equal to 1711 zero, the byte sequence is escaped using the 'surrogateescape' error handler 1712 instead of being decoded. *str* must end with a null character but cannot 1713 contain embedded null characters. */ 1714 1715PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 1716 const char *str, 1717 Py_ssize_t len, 1718 const char *errors); 1719 1720/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 1721 length using strlen(). */ 1722 1723PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 1724 const char *str, 1725 const char *errors); 1726 1727/* Encode a Unicode object to the current locale encoding. The encoder is 1728 strict is *surrogateescape* is equal to zero, otherwise the 1729 "surrogateescape" error handler is used. Return a bytes object. The string 1730 cannot contain embedded null characters.. */ 1731 1732PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 1733 PyObject *unicode, 1734 const char *errors 1735 ); 1736 1737/* --- File system encoding ---------------------------------------------- */ 1738 1739/* ParseTuple converter: encode str objects to bytes using 1740 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 1741 1742PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1743 1744/* ParseTuple converter: decode bytes objects to unicode using 1745 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. 
*/ 1746 1747PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1748 1749/* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1750 and the "surrogateescape" error handler. 1751 1752 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1753 encoding. 1754 1755 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1756*/ 1757 1758PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1759 const char *s /* encoded string */ 1760 ); 1761 1762/* Decode a string using Py_FileSystemDefaultEncoding 1763 and the "surrogateescape" error handler. 1764 1765 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1766 encoding. 1767*/ 1768 1769PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1770 const char *s, /* encoded string */ 1771 Py_ssize_t size /* size */ 1772 ); 1773 1774/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1775 "surrogateescape" error handler, and return bytes. 1776 1777 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1778 encoding. 1779*/ 1780 1781PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1782 PyObject *unicode 1783 ); 1784 1785/* --- Methods & Slots ---------------------------------------------------- 1786 1787 These are capable of handling Unicode objects and strings on input 1788 (we refer to them as strings in the descriptions) and return 1789 Unicode objects or integers as appropriate. */ 1790 1791/* Concat two strings giving a new Unicode string. 
*/ 1792 1793PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1794 PyObject *left, /* Left string */ 1795 PyObject *right /* Right string */ 1796 ); 1797 1798/* Concat two strings and put the result in *pleft 1799 (sets *pleft to NULL on error) */ 1800 1801PyAPI_FUNC(void) PyUnicode_Append( 1802 PyObject **pleft, /* Pointer to left string */ 1803 PyObject *right /* Right string */ 1804 ); 1805 1806/* Concat two strings, put the result in *pleft and drop the right object 1807 (sets *pleft to NULL on error) */ 1808 1809PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1810 PyObject **pleft, /* Pointer to left string */ 1811 PyObject *right /* Right string */ 1812 ); 1813 1814/* Split a string giving a list of Unicode strings. 1815 1816 If sep is NULL, splitting will be done at all whitespace 1817 substrings. Otherwise, splits occur at the given separator. 1818 1819 At most maxsplit splits will be done. If negative, no limit is set. 1820 1821 Separators are not included in the resulting list. 1822 1823*/ 1824 1825PyAPI_FUNC(PyObject*) PyUnicode_Split( 1826 PyObject *s, /* String to split */ 1827 PyObject *sep, /* String separator */ 1828 Py_ssize_t maxsplit /* Maxsplit count */ 1829 ); 1830 1831/* Dito, but split at line breaks. 1832 1833 CRLF is considered to be one line break. Line breaks are not 1834 included in the resulting list. */ 1835 1836PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1837 PyObject *s, /* String to split */ 1838 int keepends /* If true, line end markers are included */ 1839 ); 1840 1841/* Partition a string using a given separator. */ 1842 1843PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1844 PyObject *s, /* String to partition */ 1845 PyObject *sep /* String separator */ 1846 ); 1847 1848/* Partition a string using a given separator, searching from the end of the 1849 string. 
*/ 1850 1851PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1852 PyObject *s, /* String to partition */ 1853 PyObject *sep /* String separator */ 1854 ); 1855 1856/* Split a string giving a list of Unicode strings. 1857 1858 If sep is NULL, splitting will be done at all whitespace 1859 substrings. Otherwise, splits occur at the given separator. 1860 1861 At most maxsplit splits will be done. Unlike PyUnicode_Split, 1862 PyUnicode_RSplit splits from the end of the string. If maxsplit is 1863 negative, no limit is set. 1864 1865 Separators are not included in the resulting list. 1866 1867*/ 1868 1869PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1870 PyObject *s, /* String to split */ 1871 PyObject *sep, /* String separator */ 1872 Py_ssize_t maxsplit /* Maxsplit count */ 1873 ); 1874 1875/* Translate a string by applying a character mapping table to it and 1876 return the resulting Unicode object. 1877 1878 The mapping table must map Unicode ordinal integers to Unicode 1879 ordinal integers or None (causing deletion of the character). 1880 1881 Mapping tables may be dictionaries or sequences. Unmapped character 1882 ordinals (ones which cause a LookupError) are left untouched and 1883 are copied as-is. 1884 1885*/ 1886 1887PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1888 PyObject *str, /* String */ 1889 PyObject *table, /* Translate table */ 1890 const char *errors /* error handling */ 1891 ); 1892 1893/* Join a sequence of strings using the given separator and return 1894 the resulting Unicode string. */ 1895 1896PyAPI_FUNC(PyObject*) PyUnicode_Join( 1897 PyObject *separator, /* Separator string */ 1898 PyObject *seq /* Sequence object */ 1899 ); 1900 1901/* Return 1 if substr matches str[start:end] at the given tail end, 0 1902 otherwise.
*/ 1903 1904PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1905 PyObject *str, /* String */ 1906 PyObject *substr, /* Prefix or Suffix string */ 1907 Py_ssize_t start, /* Start index */ 1908 Py_ssize_t end, /* Stop index */ 1909 int direction /* Tail end: -1 prefix, +1 suffix */ 1910 ); 1911 1912/* Return the first position of substr in str[start:end] using the 1913 given search direction, or -1 if not found. -2 is returned in case 1914 an error occurred and an exception is set. */ 1915 1916PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1917 PyObject *str, /* String */ 1918 PyObject *substr, /* Substring to find */ 1919 Py_ssize_t start, /* Start index */ 1920 Py_ssize_t end, /* Stop index */ 1921 int direction /* Find direction: +1 forward, -1 backward */ 1922 ); 1923 1924/* Like PyUnicode_Find, but search for a single character only. */ 1925PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 1926 PyObject *str, /* String */ 1927 Py_UCS4 ch, /* Character to find */ 1928 Py_ssize_t start, /* Start index */ 1929 Py_ssize_t end, /* Stop index */ 1930 int direction /* Find direction: +1 forward, -1 backward */ 1931 ); 1932 1933/* Count the number of occurrences of substr in str[start:end]. */ 1934 1935PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1936 PyObject *str, /* String */ 1937 PyObject *substr, /* Substring to count */ 1938 Py_ssize_t start, /* Start index */ 1939 Py_ssize_t end /* Stop index */ 1940 ); 1941 1942/* Replace at most maxcount occurrences of substr in str with replstr 1943 and return the resulting Unicode object. */ 1944 1945PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1946 PyObject *str, /* String */ 1947 PyObject *substr, /* Substring to find */ 1948 PyObject *replstr, /* Replacement string */ 1949 Py_ssize_t maxcount /* Max. number of replacements to apply; 1950 -1 = all */ 1951 ); 1952 1953/* Compare two strings and return -1, 0, 1 for less than, equal, 1954 greater than resp. 1955 Raise an exception and return -1 on error.
*/ 1956 1957PyAPI_FUNC(int) PyUnicode_Compare( 1958 PyObject *left, /* Left string */ 1959 PyObject *right /* Right string */ 1960 ); 1961 /* Compare a string with an ASCII-encoded C string. NOTE(review): the result convention is presumably the same -1, 0, 1 as PyUnicode_Compare; the error behavior is not visible in this header -- confirm against the definition. */ 1962PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1963 PyObject *left, /* Left string */ 1964 const char *right /* ASCII-encoded string */ 1965 ); 1966 1967/* Rich compare two strings and return one of the following: 1968 1969 - NULL in case an exception was raised 1970 - Py_True or Py_False for successful comparisons 1971 - Py_NotImplemented in case the type combination is unknown 1972 1973 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1974 case the conversion of the arguments to Unicode fails with a 1975 UnicodeDecodeError. 1976 1977 Possible values for op: 1978 1979 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1980 1981*/ 1982 1983PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1984 PyObject *left, /* Left string */ 1985 PyObject *right, /* Right string */ 1986 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1987 ); 1988 1989/* Apply an argument tuple or dictionary to a format string and return 1990 the resulting Unicode string. */ 1991 1992PyAPI_FUNC(PyObject *) PyUnicode_Format( 1993 PyObject *format, /* Format string */ 1994 PyObject *args /* Argument tuple or dictionary */ 1995 ); 1996 1997/* Check whether element is contained in container and return 1/0 1998 accordingly. 1999 2000 element has to coerce to a one-element Unicode string. -1 is 2001 returned in case of an error. */ 2002 2003PyAPI_FUNC(int) PyUnicode_Contains( 2004 PyObject *container, /* Container string */ 2005 PyObject *element /* Element string */ 2006 ); 2007 2008/* Check whether the string contains any NUL characters. */ 2009 2010#ifndef Py_LIMITED_API 2011PyAPI_FUNC(int) _PyUnicode_HasNULChars(PyObject *); 2012#endif 2013 2014/* Checks whether argument is a valid identifier.
*/ 2015 2016PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 2017 2018#ifndef Py_LIMITED_API 2019/* Externally visible for str.strip(unicode). NOTE(review): the accepted striptype values and the role of sepobj are not visible in this header; check the definition before relying on them. */ 2020PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 2021 PyObject *self, 2022 int striptype, 2023 PyObject *sepobj 2024 ); 2025#endif 2026 2027/* Using explicitly passed-in values, insert the thousands grouping 2028 into the string pointed to by buffer. For the argument descriptions, 2029 see Objects/stringlib/localeutil.h. NOTE(review): this prototype has no parameter named "buffer"; presumably the target is the unicode object starting at index -- confirm and update this description. */ 2030#ifndef Py_LIMITED_API 2031PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( 2032 PyObject *unicode, 2033 Py_ssize_t index, 2034 Py_ssize_t n_buffer, 2035 void *digits, 2036 Py_ssize_t n_digits, 2037 Py_ssize_t min_width, 2038 const char *grouping, 2039 PyObject *thousands_sep, 2040 Py_UCS4 *maxchar); 2041#endif 2042/* === Characters Type APIs =============================================== */ 2043 2044/* Helper array used by Py_UNICODE_ISSPACE(). */ 2045 2046#ifndef Py_LIMITED_API 2047PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 2048 2049/* These should not be used directly. Use the Py_UNICODE_IS* and 2050 Py_UNICODE_TO* macros instead. 2051 2052 These APIs are implemented in Objects/unicodectype.c.
2053 2054*/ 2055 2056PyAPI_FUNC(int) _PyUnicode_IsLowercase( 2057 Py_UCS4 ch /* Unicode character */ 2058 ); 2059 2060PyAPI_FUNC(int) _PyUnicode_IsUppercase( 2061 Py_UCS4 ch /* Unicode character */ 2062 ); 2063 2064PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 2065 Py_UCS4 ch /* Unicode character */ 2066 ); 2067 2068PyAPI_FUNC(int) _PyUnicode_IsXidStart( 2069 Py_UCS4 ch /* Unicode character */ 2070 ); 2071 2072PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 2073 Py_UCS4 ch /* Unicode character */ 2074 ); 2075 2076PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 2077 const Py_UCS4 ch /* Unicode character */ 2078 ); 2079 2080PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 2081 const Py_UCS4 ch /* Unicode character */ 2082 ); 2083 2084PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 2085 Py_UCS4 ch /* Unicode character */ 2086 ); 2087 2088PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 2089 Py_UCS4 ch /* Unicode character */ 2090 ); 2091 2092PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 2093 Py_UCS4 ch /* Unicode character */ 2094 ); 2095 2096PyAPI_FUNC(int) _PyUnicode_ToLowerFull( 2097 Py_UCS4 ch, /* Unicode character */ 2098 Py_UCS4 *res /* Result buffer -- NOTE(review): presumably receives the mapped code point(s), with the count as return value; required capacity is not visible here, confirm in Objects/unicodectype.c */ 2099 ); 2100 2101PyAPI_FUNC(int) _PyUnicode_ToTitleFull( 2102 Py_UCS4 ch, /* Unicode character */ 2103 Py_UCS4 *res /* Result buffer -- NOTE(review): semantics not visible here, confirm in Objects/unicodectype.c */ 2104 ); 2105 2106PyAPI_FUNC(int) _PyUnicode_ToUpperFull( 2107 Py_UCS4 ch, /* Unicode character */ 2108 Py_UCS4 *res /* Result buffer -- NOTE(review): semantics not visible here, confirm in Objects/unicodectype.c */ 2109 ); 2110 2111PyAPI_FUNC(int) _PyUnicode_ToFoldedFull( 2112 Py_UCS4 ch, /* Unicode character */ 2113 Py_UCS4 *res /* Result buffer -- NOTE(review): semantics not visible here, confirm in Objects/unicodectype.c */ 2114 ); 2115 2116PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( 2117 Py_UCS4 ch /* Unicode character */ 2118 ); 2119 2120PyAPI_FUNC(int) _PyUnicode_IsCased( 2121 Py_UCS4 ch /* Unicode character */ 2122 ); 2123 2124PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 2125 Py_UCS4 ch /* Unicode character */ 2126 ); 2127 2128PyAPI_FUNC(int) _PyUnicode_ToDigit( 2129 Py_UCS4 ch /* Unicode character */ 2130 ); 2131 2132PyAPI_FUNC(double) _PyUnicode_ToNumeric( 2133 Py_UCS4 ch /* Unicode character */ 2134 ); 2135 2136PyAPI_FUNC(int)
_PyUnicode_IsDecimalDigit( 2137 Py_UCS4 ch /* Unicode character */ 2138 ); 2139 2140PyAPI_FUNC(int) _PyUnicode_IsDigit( 2141 Py_UCS4 ch /* Unicode character */ 2142 ); 2143 2144PyAPI_FUNC(int) _PyUnicode_IsNumeric( 2145 Py_UCS4 ch /* Unicode character */ 2146 ); 2147 2148PyAPI_FUNC(int) _PyUnicode_IsPrintable( 2149 Py_UCS4 ch /* Unicode character */ 2150 ); 2151 2152PyAPI_FUNC(int) _PyUnicode_IsAlpha( 2153 Py_UCS4 ch /* Unicode character */ 2154 ); 2155 /* Helpers operating on Py_UNICODE buffers (Py_UNICODE is the deprecated wchar_t typedef described at the top of this header). NOTE(review): judging by their names these presumably mirror the C library string functions (strlen, strcpy, strcat, strncpy, strcmp, strncmp, strchr, strrchr); termination and bounds behavior are not visible here -- confirm against the definitions. */ 2156PyAPI_FUNC(size_t) Py_UNICODE_strlen( 2157 const Py_UNICODE *u 2158 ); 2159 2160PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 2161 Py_UNICODE *s1, 2162 const Py_UNICODE *s2); 2163 2164PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat( 2165 Py_UNICODE *s1, const Py_UNICODE *s2); 2166 2167PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 2168 Py_UNICODE *s1, 2169 const Py_UNICODE *s2, 2170 size_t n); 2171 2172PyAPI_FUNC(int) Py_UNICODE_strcmp( 2173 const Py_UNICODE *s1, 2174 const Py_UNICODE *s2 2175 ); 2176 2177PyAPI_FUNC(int) Py_UNICODE_strncmp( 2178 const Py_UNICODE *s1, 2179 const Py_UNICODE *s2, 2180 size_t n 2181 ); 2182 2183PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 2184 const Py_UNICODE *s, 2185 Py_UNICODE c 2186 ); 2187 2188PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 2189 const Py_UNICODE *s, 2190 Py_UNICODE c 2191 ); 2192 2193/* Create a copy of a unicode string ending with a nul character. Return NULL 2194 and raise a MemoryError exception on memory allocation failure, otherwise 2195 return a new allocated buffer (use PyMem_Free() to free the buffer).
*/ 2196 2197PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 2198 PyObject *unicode /* Unicode object to copy */ 2199 ); 2200#endif /* !Py_LIMITED_API */ 2201 /* Declared only when Py_DEBUG is defined and Py_LIMITED_API is not. NOTE(review): the meaning of the return value and of check_content is not visible in this header -- see the definition. */ 2202#if defined(Py_DEBUG) && !defined(Py_LIMITED_API) 2203PyAPI_FUNC(int) _PyUnicode_CheckConsistency( 2204 PyObject *op, 2205 int check_content); 2206#endif /* Py_DEBUG && !Py_LIMITED_API */ 2207 2208/* Return an interned Unicode object for an Identifier; may fail if there is no memory. */ 2209PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); 2210/* Clear all static strings. */ 2211PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); 2212 2213#ifdef __cplusplus 2214} /* presumably closes an extern "C" block opened earlier in this header */ 2215#endif 2216#endif /* !Py_UNICODEOBJECT_H */ 2217