unicodeobject.h revision 080a2c087e5fa08c44ff121d74ea8ad9d4413c58
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal. (See 11http://www.egenix.com/files/python/unicode-proposal.txt). 12 13Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58#include <ctype.h> 59 60/* === Internal API ======================================================= */ 61 62/* --- Internal Unicode Format -------------------------------------------- */ 63 64/* Python 3.x requires unicode */ 65#define Py_USING_UNICODE 66 67#ifndef SIZEOF_WCHAR_T 68#error Must define SIZEOF_WCHAR_T 69#endif 70 71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Py_UNICODE was the native Unicode storage format (code unit) used by 87 Python and represents a single Unicode element in the Unicode type. 88 With PEP 393, Py_UNICODE is deprecated and replaced with a 89 typedef to wchar_t. */ 90 91#ifndef Py_LIMITED_API 92#define PY_UNICODE_TYPE wchar_t 93typedef wchar_t Py_UNICODE; 94#endif 95 96/* If the compiler provides a wchar_t type we try to support it 97 through the interface functions PyUnicode_FromWideChar(), 98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). 
*/ 99 100#ifdef HAVE_USABLE_WCHAR_T 101# ifndef HAVE_WCHAR_H 102# define HAVE_WCHAR_H 103# endif 104#endif 105 106#if defined(MS_WINDOWS) 107# define HAVE_MBCS 108#endif 109 110#ifdef HAVE_WCHAR_H 111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 112# ifdef _HAVE_BSDI 113# include <time.h> 114# endif 115# include <wchar.h> 116#endif 117 118/* Py_UCS4 and Py_UCS2 are typedefs for the respective 119 unicode representations. */ 120#if SIZEOF_INT == 4 121typedef unsigned int Py_UCS4; 122#elif SIZEOF_LONG == 4 123typedef unsigned long Py_UCS4; 124#else 125#error "Could not find a proper typedef for Py_UCS4" 126#endif 127 128#if SIZEOF_SHORT == 2 129typedef unsigned short Py_UCS2; 130#else 131#error "Could not find a proper typedef for Py_UCS2" 132#endif 133 134typedef unsigned char Py_UCS1; 135 136/* --- Internal Unicode Operations ---------------------------------------- */ 137 138/* Since splitting on whitespace is an important use case, and 139 whitespace in most situations is solely ASCII whitespace, we 140 optimize for the common case by using a quick look-up table 141 _Py_ascii_whitespace (see below) with an inlined check. 142 143 */ 144#ifndef Py_LIMITED_API 145#define Py_UNICODE_ISSPACE(ch) \ 146 ((ch) < 128U ? 
_Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 147 148#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 149#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 150#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 151#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 152 153#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 154#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 155#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 156 157#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 158#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 159#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 160#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 161 162#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 163#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 164#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 165 166#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 167 168#define Py_UNICODE_ISALNUM(ch) \ 169 (Py_UNICODE_ISALPHA(ch) || \ 170 Py_UNICODE_ISDECIMAL(ch) || \ 171 Py_UNICODE_ISDIGIT(ch) || \ 172 Py_UNICODE_ISNUMERIC(ch)) 173 174#define Py_UNICODE_COPY(target, source, length) \ 175 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 176 177#define Py_UNICODE_FILL(target, value, length) \ 178 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 179 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 180 } while (0) 181 182/* macros to work with surrogates */ 183#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDFFF) 184#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) 185#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) 186/* Join two surrogate characters and return a single Py_UCS4 value. 
*/ 187#define Py_UNICODE_JOIN_SURROGATES(high, low) \ 188 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 189 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 190/* high surrogate = top 10 bits added to D800 */ 191#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10)) 192/* low surrogate = bottom 10 bits added to DC00 */ 193#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF)) 194 195/* Check if substring matches at given offset. The offset must be 196 valid, and the substring must not be empty. */ 197 198#define Py_UNICODE_MATCH(string, offset, substring) \ 199 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ 200 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ 201 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) 202 203#endif /* Py_LIMITED_API */ 204 205#ifdef __cplusplus 206extern "C" { 207#endif 208 209/* --- Unicode Type ------------------------------------------------------- */ 210 211#ifndef Py_LIMITED_API 212 213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject 214 structure. state.ascii and state.compact are set, and the data 215 immediately follow the structure. utf8_length and wstr_length can be found 216 in the length field; the utf8 pointer is equal to the data pointer. 
*/
typedef struct {
    /* There are 4 forms of Unicode strings:

       - compact ascii:

         * structure = PyASCIIObject
         * test: PyUnicode_IS_COMPACT_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * ready = 1
         * (length is the length of the utf8 and wstr strings)
         * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)

       - compact:

         * structure = PyCompactUnicodeObject
         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ready = 1
         * ascii = 0
         * utf8 is not shared with data
         * utf8_length = 0 if utf8 is NULL
         * wstr is shared with data and wstr_length=length
           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
         * wstr_length = 0 if wstr is NULL
         * (data starts just after the structure)

       - legacy string, not ready:

         * structure = PyUnicodeObject
         * test: kind == PyUnicode_WCHAR_KIND
         * length = 0 (use wstr_length)
         * hash = -1
         * kind = PyUnicode_WCHAR_KIND
         * compact = 0
         * ascii = 0
         * ready = 0
         * interned = SSTATE_NOT_INTERNED
         * wstr is not NULL
         * data.any is NULL
         * utf8 is NULL
         * utf8_length = 0

       - legacy string, ready:

         * structure = PyUnicodeObject structure
         * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * ready = 1
         * data.any is not NULL
         * utf8 is shared and utf8_length = length with data.any if ascii = 1
         * utf8_length = 0 if utf8 is NULL
         * wstr is shared with data.any and wstr_length = length
           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
         * wstr_length = 0 if wstr is NULL

       Compact strings use only one memory block (structure + characters),
       whereas legacy strings use one block for the structure and one block
       for characters.

       Legacy strings are created by PyUnicode_FromUnicode() and
       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
       when PyUnicode_READY() is called.

       See also _PyUnicode_CheckConsistency().
    */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
    /* Bit-field flags describing how the string is stored.  The field
       widths below are part of the object layout. */
    struct {
        /*
           SSTATE_NOT_INTERNED (0)
           SSTATE_INTERNED_MORTAL (1)
           SSTATE_INTERNED_IMMORTAL (2)

           If interned != SSTATE_NOT_INTERNED, the two references from the
           dictionary to this object are *not* counted in ob_refcnt.
         */
        unsigned int interned:2;
        /* Character size:

           - PyUnicode_WCHAR_KIND (0):

             * character type = wchar_t (16 or 32 bits, depending on the
               platform)

           - PyUnicode_1BYTE_KIND (1):

             * character type = Py_UCS1 (8 bits, unsigned)
             * all characters are in the range U+0000-U+00FF (latin1)
             * if ascii is set, all characters are in the range U+0000-U+007F
               (ASCII), otherwise at least one character is in the range
               U+0080-U+00FF

           - PyUnicode_2BYTE_KIND (2):

             * character type = Py_UCS2 (16 bits, unsigned)
             * all characters are in the range U+0000-U+FFFF (BMP)
             * at least one character is in the range U+0100-U+FFFF

           - PyUnicode_4BYTE_KIND (4):

             * character type = Py_UCS4 (32 bits, unsigned)
             * all characters are in the range U+0000-U+10FFFF
             * at least one character is in the range U+10000-U+10FFFF
         */
        unsigned int kind:3;
        /* Compact is with respect to the allocation scheme. Compact unicode
           objects only require one memory block while non-compact objects use
           one block for the PyUnicodeObject struct and another for its data
           buffer. */
        unsigned int compact:1;
        /* The string only contains characters in the range U+0000-U+007F (ASCII)
           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
           set, use the PyASCIIObject structure. */
        unsigned int ascii:1;
        /* The ready flag indicates whether the object layout is initialized
           completely. This means that this is either a compact object, or
           the data pointer is filled out. The bit is redundant, and helps
           to minimize the test in PyUnicode_IS_READY(). */
        unsigned int ready:1;
    } state;
    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
} PyASCIIObject;

/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follow the structure.  The structure begins with a
   PyASCIIObject, so a pointer to it may also be read as a PyASCIIObject. */
typedef struct {
    PyASCIIObject _base;
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
                                 * surrogates count as two code points. */
} PyCompactUnicodeObject;

/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
   PyUnicodeObject structure. The actual string data is initially in the wstr
   block, and copied into the data block using _PyUnicode_Ready.
*/ 365typedef struct { 366 PyCompactUnicodeObject _base; 367 union { 368 void *any; 369 Py_UCS1 *latin1; 370 Py_UCS2 *ucs2; 371 Py_UCS4 *ucs4; 372 } data; /* Canonical, smallest-form Unicode buffer */ 373} PyUnicodeObject; 374#endif 375 376PyAPI_DATA(PyTypeObject) PyUnicode_Type; 377PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 378 379#define PyUnicode_Check(op) \ 380 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 381#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 382 383/* Fast access macros */ 384#ifndef Py_LIMITED_API 385 386#define PyUnicode_WSTR_LENGTH(op) \ 387 (PyUnicode_IS_COMPACT_ASCII(op) ? \ 388 ((PyASCIIObject*)op)->length : \ 389 ((PyCompactUnicodeObject*)op)->wstr_length) 390 391/* Returns the deprecated Py_UNICODE representation's size in code units 392 (this includes surrogate pairs as 2 units). 393 If the Py_UNICODE representation is not available, it will be computed 394 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ 395 396#define PyUnicode_GET_SIZE(op) \ 397 (assert(PyUnicode_Check(op)), \ 398 (((PyASCIIObject *)(op))->wstr) ? \ 399 PyUnicode_WSTR_LENGTH(op) : \ 400 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \ 401 assert(((PyASCIIObject *)(op))->wstr), \ 402 PyUnicode_WSTR_LENGTH(op))) 403 404#define PyUnicode_GET_DATA_SIZE(op) \ 405 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) 406 407/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE 408 representation on demand. Using this macro is very inefficient now, 409 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or 410 use PyUnicode_WRITE() and PyUnicode_READ(). */ 411 412#define PyUnicode_AS_UNICODE(op) \ 413 (assert(PyUnicode_Check(op)), \ 414 (((PyASCIIObject *)(op))->wstr) ? 
(((PyASCIIObject *)(op))->wstr) : \ 415 PyUnicode_AsUnicode((PyObject *)(op))) 416 417#define PyUnicode_AS_DATA(op) \ 418 ((const char *)(PyUnicode_AS_UNICODE(op))) 419 420 421/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ 422 423/* Values for PyASCIIObject.state: */ 424 425/* Interning state. */ 426#define SSTATE_NOT_INTERNED 0 427#define SSTATE_INTERNED_MORTAL 1 428#define SSTATE_INTERNED_IMMORTAL 2 429 430/* Return true if the string contains only ASCII characters, or 0 if not. The 431 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be 432 ready. */ 433#define PyUnicode_IS_ASCII(op) \ 434 (assert(PyUnicode_Check(op)), \ 435 assert(PyUnicode_IS_READY(op)), \ 436 ((PyASCIIObject*)op)->state.ascii) 437 438/* Return true if the string is compact or 0 if not. 439 No type checks or Ready calls are performed. */ 440#define PyUnicode_IS_COMPACT(op) \ 441 (((PyASCIIObject*)(op))->state.compact) 442 443/* Return true if the string is a compact ASCII string (use PyASCIIObject 444 structure), or 0 if not. No type checks or Ready calls are performed. */ 445#define PyUnicode_IS_COMPACT_ASCII(op) \ 446 (((PyASCIIObject*)op)->state.ascii && PyUnicode_IS_COMPACT(op)) 447 448enum PyUnicode_Kind { 449/* String contains only wstr byte characters. This is only possible 450 when the string was created with a legacy API and _PyUnicode_Ready() 451 has not been called yet. */ 452 PyUnicode_WCHAR_KIND = 0, 453/* Return values of the PyUnicode_KIND() macro: */ 454 PyUnicode_1BYTE_KIND = 1, 455 PyUnicode_2BYTE_KIND = 2, 456 PyUnicode_4BYTE_KIND = 4 457}; 458 459/* Return pointers to the canonical representation cast to unsigned char, 460 Py_UCS2, or Py_UCS4 for direct character access. 461 No checks are performed, use PyUnicode_KIND() before to ensure 462 these will work correctly. 
*/ 463 464#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op)) 465#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) 466#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) 467 468/* Return one of the PyUnicode_*_KIND values defined above. */ 469#define PyUnicode_KIND(op) \ 470 (assert(PyUnicode_Check(op)), \ 471 assert(PyUnicode_IS_READY(op)), \ 472 ((PyASCIIObject *)(op))->state.kind) 473 474/* Return a void pointer to the raw unicode buffer. */ 475#define _PyUnicode_COMPACT_DATA(op) \ 476 (PyUnicode_IS_ASCII(op) ? \ 477 ((void*)((PyASCIIObject*)(op) + 1)) : \ 478 ((void*)((PyCompactUnicodeObject*)(op) + 1))) 479 480#define _PyUnicode_NONCOMPACT_DATA(op) \ 481 (assert(((PyUnicodeObject*)(op))->data.any), \ 482 ((((PyUnicodeObject *)(op))->data.any))) 483 484#define PyUnicode_DATA(op) \ 485 (assert(PyUnicode_Check(op)), \ 486 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \ 487 _PyUnicode_NONCOMPACT_DATA(op)) 488 489/* In the access macros below, "kind" may be evaluated more than once. 490 All other macro parameters are evaluated exactly once, so it is safe 491 to put side effects into them (such as increasing the index). */ 492 493/* Write into the canonical representation, this macro does not do any sanity 494 checks and is intended for usage in loops. The caller should cache the 495 kind and data pointers obtained from other macro calls. 496 index is the index in the string (starts at 0) and value is the new 497 code point value which should be written to that location. 
*/ 498#define PyUnicode_WRITE(kind, data, index, value) \ 499 do { \ 500 switch ((kind)) { \ 501 case PyUnicode_1BYTE_KIND: { \ 502 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \ 503 break; \ 504 } \ 505 case PyUnicode_2BYTE_KIND: { \ 506 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \ 507 break; \ 508 } \ 509 default: { \ 510 assert((kind) == PyUnicode_4BYTE_KIND); \ 511 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \ 512 } \ 513 } \ 514 } while (0) 515 516/* Read a code point from the string's canonical representation. No checks 517 or ready calls are performed. */ 518#define PyUnicode_READ(kind, data, index) \ 519 ((Py_UCS4) \ 520 ((kind) == PyUnicode_1BYTE_KIND ? \ 521 ((const Py_UCS1 *)(data))[(index)] : \ 522 ((kind) == PyUnicode_2BYTE_KIND ? \ 523 ((const Py_UCS2 *)(data))[(index)] : \ 524 ((const Py_UCS4 *)(data))[(index)] \ 525 ) \ 526 )) 527 528/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it 529 calls PyUnicode_KIND() and might call it twice. For single reads, use 530 PyUnicode_READ_CHAR, for multiple consecutive reads callers should 531 cache kind and use PyUnicode_READ instead. */ 532#define PyUnicode_READ_CHAR(unicode, index) \ 533 (assert(PyUnicode_Check(unicode)), \ 534 assert(PyUnicode_IS_READY(unicode)), \ 535 (Py_UCS4) \ 536 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \ 537 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \ 538 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \ 539 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \ 540 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \ 541 ) \ 542 )) 543 544/* Returns the length of the unicode string. The caller has to make sure that 545 the string has it's canonical representation set before calling 546 this macro. Call PyUnicode_(FAST_)Ready to ensure that. 
*/ 547#define PyUnicode_GET_LENGTH(op) \ 548 (assert(PyUnicode_Check(op)), \ 549 assert(PyUnicode_IS_READY(op)), \ 550 ((PyASCIIObject *)(op))->length) 551 552 553/* Fast check to determine whether an object is ready. Equivalent to 554 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */ 555 556#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) 557 558/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best 559 case. If the canonical representation is not yet set, it will still call 560 _PyUnicode_Ready(). 561 Returns 0 on success and -1 on errors. */ 562#define PyUnicode_READY(op) \ 563 (assert(PyUnicode_Check(op)), \ 564 (PyUnicode_IS_READY(op) ? \ 565 0 : _PyUnicode_Ready((PyObject *)(op)))) 566 567/* Return a maximum character value which is suitable for creating another 568 string based on op. This is always an approximation but more efficient 569 than iterating over the string. */ 570#define PyUnicode_MAX_CHAR_VALUE(op) \ 571 (assert(PyUnicode_IS_READY(op)), \ 572 (PyUnicode_IS_ASCII(op) ? \ 573 (0x7f) : \ 574 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ 575 (0xffU) : \ 576 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \ 577 (0xffffU) : \ 578 (0x10ffffU))))) 579 580#endif 581 582/* --- Constants ---------------------------------------------------------- */ 583 584/* This Unicode character will be used as replacement character during 585 decoding if the errors argument is set to "replace". Note: the 586 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 587 Unicode 3.0. */ 588 589#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 590 591/* === Public API ========================================================= */ 592 593/* --- Plain Py_UNICODE --------------------------------------------------- */ 594 595/* With PEP 393, this is the recommended way to allocate a new unicode object. 596 This function will allocate the object and its buffer in a single memory 597 block. 
Objects created using this function are not resizable. */ 598#ifndef Py_LIMITED_API 599PyAPI_FUNC(PyObject*) PyUnicode_New( 600 Py_ssize_t size, /* Number of code points in the new string */ 601 Py_UCS4 maxchar /* maximum code point value in the string */ 602 ); 603#endif 604 605/* Initializes the canonical string representation from a the deprecated 606 wstr/Py_UNICODE representation. This function is used to convert Unicode 607 objects which were created using the old API to the new flexible format 608 introduced with PEP 393. 609 610 Don't call this function directly, use the public PyUnicode_READY() macro 611 instead. */ 612#ifndef Py_LIMITED_API 613PyAPI_FUNC(int) _PyUnicode_Ready( 614 PyObject *unicode /* Unicode object */ 615 ); 616#endif 617 618/* Get a copy of a Unicode string. */ 619#ifndef Py_LIMITED_API 620PyAPI_FUNC(PyObject*) _PyUnicode_Copy( 621 PyObject *unicode 622 ); 623#endif 624 625/* Copy character from one unicode object into another, this function performs 626 character conversion when necessary and falls back to memcpy() if possible. 627 628 Fail if to is too small (smaller than *how_many* or smaller than 629 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > 630 kind(to), or if *to* has more than 1 reference. 631 632 Return the number of written character, or return -1 and raise an exception 633 on error. 634 635 Pseudo-code: 636 637 how_many = min(how_many, len(from) - from_start) 638 to[to_start:to_start+how_many] = from[from_start:from_start+how_many] 639 return how_many 640 641 Note: The function doesn't write a terminating null character. 642 */ 643#ifndef Py_LIMITED_API 644PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( 645 PyObject *to, 646 Py_ssize_t to_start, 647 PyObject *from, 648 Py_ssize_t from_start, 649 Py_ssize_t how_many 650 ); 651 652/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so 653 may crash if parameters are invalid (e.g. if the output string 654 is too short). 
*/ 655PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters( 656 PyObject *to, 657 Py_ssize_t to_start, 658 PyObject *from, 659 Py_ssize_t from_start, 660 Py_ssize_t how_many 661 ); 662#endif 663 664#ifndef Py_LIMITED_API 665/* Fill a string with a character: write fill_char into 666 unicode[start:start+length]. 667 668 Fail if fill_char is bigger than the string maximum character, or if the 669 string has more than 1 reference. 670 671 Return the number of written character, or return -1 and raise an exception 672 on error. */ 673PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill( 674 PyObject *unicode, 675 Py_ssize_t start, 676 Py_ssize_t length, 677 Py_UCS4 fill_char 678 ); 679 680/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash 681 if parameters are invalid (e.g. if length is longer than the string). */ 682PyAPI_FUNC(void) _PyUnicode_FastFill( 683 PyObject *unicode, 684 Py_ssize_t start, 685 Py_ssize_t length, 686 Py_UCS4 fill_char 687 ); 688#endif 689 690/* Create a Unicode Object from the Py_UNICODE buffer u of the given 691 size. 692 693 u may be NULL which causes the contents to be undefined. It is the 694 user's responsibility to fill in the needed data afterwards. Note 695 that modifying the Unicode object contents after construction is 696 only allowed if u was set to NULL. 697 698 The buffer is copied into the new object. */ 699 700#ifndef Py_LIMITED_API 701PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 702 const Py_UNICODE *u, /* Unicode buffer */ 703 Py_ssize_t size /* size of buffer */ 704 ); 705#endif 706 707/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 708PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 709 const char *u, /* UTF-8 encoded string */ 710 Py_ssize_t size /* size of buffer */ 711 ); 712 713/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 714 UTF-8 encoded bytes. The size is determined with strlen(). 
*/ 715PyAPI_FUNC(PyObject*) PyUnicode_FromString( 716 const char *u /* UTF-8 encoded string */ 717 ); 718 719#ifndef Py_LIMITED_API 720/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. 721 Scan the string to find the maximum character. */ 722PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( 723 int kind, 724 const void *buffer, 725 Py_ssize_t size); 726 727/* Create a new string from a buffer of ASCII characters. 728 WARNING: Don't check if the string contains any non-ASCII character. */ 729PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII( 730 const char *buffer, 731 Py_ssize_t size); 732#endif 733 734PyAPI_FUNC(PyObject*) PyUnicode_Substring( 735 PyObject *str, 736 Py_ssize_t start, 737 Py_ssize_t end); 738 739#ifndef Py_LIMITED_API 740/* Compute the maximum character of the substring unicode[start:end]. 741 Return 127 for an empty string. */ 742PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( 743 PyObject *unicode, 744 Py_ssize_t start, 745 Py_ssize_t end); 746#endif 747 748/* Copy the string into a UCS4 buffer including the null character if copy_null 749 is set. Return NULL and raise an exception on error. Raise a ValueError if 750 the buffer is smaller than the string. Return buffer on success. 751 752 buflen is the length of the buffer in (Py_UCS4) characters. */ 753PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 754 PyObject *unicode, 755 Py_UCS4* buffer, 756 Py_ssize_t buflen, 757 int copy_null); 758 759/* Copy the string into a UCS4 buffer. A new buffer is allocated using 760 * PyMem_Malloc; if this fails, NULL is returned with a memory error 761 exception set. */ 762PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 763 764/* Return a read-only pointer to the Unicode object's internal 765 Py_UNICODE buffer. 766 If the wchar_t/Py_UNICODE representation is not yet available, this 767 function will calculate it. 
*/ 768 769#ifndef Py_LIMITED_API 770PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 771 PyObject *unicode /* Unicode object */ 772 ); 773#endif 774 775/* Return a read-only pointer to the Unicode object's internal 776 Py_UNICODE buffer and save the length at size. 777 If the wchar_t/Py_UNICODE representation is not yet available, this 778 function will calculate it. */ 779 780#ifndef Py_LIMITED_API 781PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( 782 PyObject *unicode, /* Unicode object */ 783 Py_ssize_t *size /* location where to save the length */ 784 ); 785#endif 786 787/* Get the length of the Unicode object. */ 788 789PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 790 PyObject *unicode 791); 792 793/* Get the number of Py_UNICODE units in the 794 string representation. */ 795 796PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 797 PyObject *unicode /* Unicode object */ 798 ); 799 800/* Read a character from the string. */ 801 802PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 803 PyObject *unicode, 804 Py_ssize_t index 805 ); 806 807/* Write a character to the string. The string must have been created through 808 PyUnicode_New, must not be shared, and must not have been hashed yet. 809 810 Return 0 on success, -1 on error. */ 811 812PyAPI_FUNC(int) PyUnicode_WriteChar( 813 PyObject *unicode, 814 Py_ssize_t index, 815 Py_UCS4 character 816 ); 817 818#ifndef Py_LIMITED_API 819/* Get the maximum ordinal for a Unicode character. */ 820PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 821#endif 822 823/* Resize an Unicode object. The length is the number of characters, except 824 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length 825 is the number of Py_UNICODE characters. 826 827 *unicode is modified to point to the new (resized) object and 0 828 returned on success. 829 830 Try to resize the string in place (which is usually faster than allocating 831 a new string and copy characters), or create a new string. 
832 833 Error handling is implemented as follows: an exception is set, -1 834 is returned and *unicode left untouched. 835 836 WARNING: The function doesn't check string content, the result may not be a 837 string in canonical representation. */ 838 839PyAPI_FUNC(int) PyUnicode_Resize( 840 PyObject **unicode, /* Pointer to the Unicode object */ 841 Py_ssize_t length /* New length */ 842 ); 843 844/* Coerce obj to an Unicode object and return a reference with 845 *incremented* refcount. 846 847 Coercion is done in the following way: 848 849 1. bytes, bytearray and other char buffer compatible objects are decoded 850 under the assumptions that they contain data using the UTF-8 851 encoding. Decoding is done in "strict" mode. 852 853 2. All other objects (including Unicode objects) raise an 854 exception. 855 856 The API returns NULL in case of an error. The caller is responsible 857 for decref'ing the returned objects. 858 859*/ 860 861PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 862 register PyObject *obj, /* Object */ 863 const char *encoding, /* encoding */ 864 const char *errors /* error handling */ 865 ); 866 867/* Coerce obj to an Unicode object and return a reference with 868 *incremented* refcount. 869 870 Unicode objects are passed back as-is (subclasses are converted to 871 true Unicode objects), all other objects are delegated to 872 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 873 using UTF-8 encoding as basis for decoding the object. 874 875 The API returns NULL in case of an error. The caller is responsible 876 for decref'ing the returned objects. 877 878*/ 879 880PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 881 register PyObject *obj /* Object */ 882 ); 883 884PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 885 const char *format, /* ASCII-encoded string */ 886 va_list vargs 887 ); 888PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 889 const char *format, /* ASCII-encoded string */ 890 ... 
891 ); 892 893#ifndef Py_LIMITED_API 894typedef struct { 895 PyObject *buffer; 896 void *data; 897 enum PyUnicode_Kind kind; 898 Py_UCS4 maxchar; 899 Py_ssize_t size; 900 Py_ssize_t pos; 901 /* minimum length of the buffer when overallocation is enabled, 902 see _PyUnicodeWriter_Init() */ 903 Py_ssize_t min_length; 904 unsigned char overallocate; 905 /* If readonly is 1, buffer is a shared string (cannot be modified) 906 and size is set to 0. */ 907 unsigned char readonly; 908} _PyUnicodeWriter ; 909 910/* Initialize a Unicode writer. 911 912 If min_length is greater than zero, _PyUnicodeWriter_Prepare() 913 overallocates the buffer and min_length is the minimum length in characters 914 of the buffer. */ 915PyAPI_FUNC(void) 916_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length); 917 918/* Prepare the buffer to write 'length' characters 919 with the specified maximum character. 920 921 Return 0 on success, raise an exception and return -1 on error. */ 922#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \ 923 (((MAXCHAR) <= (WRITER)->maxchar \ 924 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \ 925 ? 0 \ 926 : (((LENGTH) == 0) \ 927 ? 0 \ 928 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))) 929 930/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro 931 instead. */ 932PyAPI_FUNC(int) 933_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 934 Py_ssize_t length, Py_UCS4 maxchar); 935 936PyAPI_FUNC(int) 937_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str); 938 939PyAPI_FUNC(PyObject *) 940_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer); 941 942PyAPI_FUNC(void) 943_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer); 944#endif 945 946#ifndef Py_LIMITED_API 947/* Format the object based on the format_spec, as defined in PEP 3101 948 (Advanced String Formatting). 
*/ 949PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter( 950 _PyUnicodeWriter *writer, 951 PyObject *obj, 952 PyObject *format_spec, 953 Py_ssize_t start, 954 Py_ssize_t end); 955#endif 956 957PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 958PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 959PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 960 const char *u /* UTF-8 encoded string */ 961 ); 962#ifndef Py_LIMITED_API 963PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 964#endif 965 966/* Use only if you know it's a string */ 967#define PyUnicode_CHECK_INTERNED(op) \ 968 (((PyASCIIObject *)(op))->state.interned) 969 970/* --- wchar_t support for platforms which support it --------------------- */ 971 972#ifdef HAVE_WCHAR_H 973 974/* Create a Unicode Object from the wchar_t buffer w of the given 975 size. 976 977 The buffer is copied into the new object. */ 978 979PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 980 register const wchar_t *w, /* wchar_t buffer */ 981 Py_ssize_t size /* size of buffer */ 982 ); 983 984/* Copies the Unicode Object contents into the wchar_t buffer w. At 985 most size wchar_t characters are copied. 986 987 Note that the resulting wchar_t string may or may not be 988 0-terminated. It is the responsibility of the caller to make sure 989 that the wchar_t string is 0-terminated in case this is required by 990 the application. 991 992 Returns the number of wchar_t characters copied (excluding a 993 possibly trailing 0-termination character) or -1 in case of an 994 error. */ 995 996PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 997 PyObject *unicode, /* Unicode object */ 998 register wchar_t *w, /* wchar_t buffer */ 999 Py_ssize_t size /* size of buffer */ 1000 ); 1001 1002/* Convert the Unicode object to a wide character string. The output string 1003 always ends with a nul character. If size is not NULL, write the number of 1004 wide characters (excluding the null character) into *size. 
1005 1006 Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it) 1007 on success. On error, returns NULL, *size is undefined and raises a 1008 MemoryError. */ 1009 1010PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 1011 PyObject *unicode, /* Unicode object */ 1012 Py_ssize_t *size /* number of characters of the result */ 1013 ); 1014 1015#ifndef Py_LIMITED_API 1016PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); 1017#endif 1018 1019#endif 1020 1021/* --- Unicode ordinals --------------------------------------------------- */ 1022 1023/* Create a Unicode Object from the given Unicode code point ordinal. 1024 1025 The ordinal must be in range(0x110000). A ValueError is 1026 raised in case it is not. 1027 1028*/ 1029 1030PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 1031 1032/* --- Free-list management ----------------------------------------------- */ 1033 1034/* Clear the free list used by the Unicode implementation. 1035 1036 This can be used to release memory used for objects on the free 1037 list back to the Python memory allocator. 1038 1039*/ 1040 1041PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 1042 1043/* === Builtin Codecs ===================================================== 1044 1045 Many of these APIs take two arguments encoding and errors. These 1046 parameters encoding and errors have the same semantics as the ones 1047 of the builtin str() API. 1048 1049 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 1050 1051 Error handling is set by errors which may also be set to NULL 1052 meaning to use the default handling defined for the codec. Default 1053 error handling for all builtin codecs is "strict" (ValueErrors are 1054 raised). 1055 1056 The codecs all use a similar interface. Only deviation from the 1057 generic ones are documented. 
1058 1059*/ 1060 1061/* --- Manage the default encoding ---------------------------------------- */ 1062 1063/* Returns a pointer to the default encoding (UTF-8) of the 1064 Unicode object unicode and the size of the encoded representation 1065 in bytes stored in *size. 1066 1067 In case of an error, no *size is set. 1068 1069 This function caches the UTF-8 encoded string in the unicodeobject 1070 and subsequent calls will return the same string. The memory is released 1071 when the unicodeobject is deallocated. 1072 1073 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to 1074 support the previous internal function with the same behaviour. 1075 1076 *** This API is for interpreter INTERNAL USE ONLY and will likely 1077 *** be removed or changed in the future. 1078 1079 *** If you need to access the Unicode object as UTF-8 bytes string, 1080 *** please use PyUnicode_AsUTF8String() instead. 1081*/ 1082 1083#ifndef Py_LIMITED_API 1084PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize( 1085 PyObject *unicode, 1086 Py_ssize_t *size); 1087#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize 1088#endif 1089 1090/* Returns a pointer to the default encoding (UTF-8) of the 1091 Unicode object unicode. 1092 1093 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation 1094 in the unicodeobject. 1095 1096 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to 1097 support the previous internal function with the same behaviour. 1098 1099 Use of this API is DEPRECATED since no size information can be 1100 extracted from the returned data. 1101 1102 *** This API is for interpreter INTERNAL USE ONLY and will likely 1103 *** be removed or changed for Python 3.1. 1104 1105 *** If you need to access the Unicode object as UTF-8 bytes string, 1106 *** please use PyUnicode_AsUTF8String() instead. 
1107 1108*/ 1109 1110#ifndef Py_LIMITED_API 1111PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode); 1112#define _PyUnicode_AsString PyUnicode_AsUTF8 1113#endif 1114 1115/* Returns "utf-8". */ 1116 1117PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 1118 1119/* --- Generic Codecs ----------------------------------------------------- */ 1120 1121/* Create a Unicode object by decoding the encoded string s of the 1122 given size. */ 1123 1124PyAPI_FUNC(PyObject*) PyUnicode_Decode( 1125 const char *s, /* encoded string */ 1126 Py_ssize_t size, /* size of buffer */ 1127 const char *encoding, /* encoding */ 1128 const char *errors /* error handling */ 1129 ); 1130 1131/* Decode a Unicode object unicode and return the result as Python 1132 object. */ 1133 1134PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 1135 PyObject *unicode, /* Unicode object */ 1136 const char *encoding, /* encoding */ 1137 const char *errors /* error handling */ 1138 ); 1139 1140/* Decode a Unicode object unicode and return the result as Unicode 1141 object. */ 1142 1143PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 1144 PyObject *unicode, /* Unicode object */ 1145 const char *encoding, /* encoding */ 1146 const char *errors /* error handling */ 1147 ); 1148 1149/* Encodes a Py_UNICODE buffer of the given size and returns a 1150 Python string object. */ 1151 1152#ifndef Py_LIMITED_API 1153PyAPI_FUNC(PyObject*) PyUnicode_Encode( 1154 const Py_UNICODE *s, /* Unicode char buffer */ 1155 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 1156 const char *encoding, /* encoding */ 1157 const char *errors /* error handling */ 1158 ); 1159#endif 1160 1161/* Encodes a Unicode object and returns the result as Python 1162 object. 
*/ 1163 1164PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 1165 PyObject *unicode, /* Unicode object */ 1166 const char *encoding, /* encoding */ 1167 const char *errors /* error handling */ 1168 ); 1169 1170/* Encodes a Unicode object and returns the result as Python string 1171 object. */ 1172 1173PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 1174 PyObject *unicode, /* Unicode object */ 1175 const char *encoding, /* encoding */ 1176 const char *errors /* error handling */ 1177 ); 1178 1179/* Encodes a Unicode object and returns the result as Unicode 1180 object. */ 1181 1182PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 1183 PyObject *unicode, /* Unicode object */ 1184 const char *encoding, /* encoding */ 1185 const char *errors /* error handling */ 1186 ); 1187 1188/* Build an encoding map. */ 1189 1190PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 1191 PyObject* string /* 256 character map */ 1192 ); 1193 1194/* --- UTF-7 Codecs ------------------------------------------------------- */ 1195 1196PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 1197 const char *string, /* UTF-7 encoded string */ 1198 Py_ssize_t length, /* size of string */ 1199 const char *errors /* error handling */ 1200 ); 1201 1202PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 1203 const char *string, /* UTF-7 encoded string */ 1204 Py_ssize_t length, /* size of string */ 1205 const char *errors, /* error handling */ 1206 Py_ssize_t *consumed /* bytes consumed */ 1207 ); 1208 1209#ifndef Py_LIMITED_API 1210PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 1211 const Py_UNICODE *data, /* Unicode char buffer */ 1212 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1213 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1214 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1215 const char *errors /* error handling */ 1216 ); 1217PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7( 1218 PyObject *unicode, /* Unicode object */ 1219 int base64SetO, /* 
Encode RFC2152 Set O characters in base64 */ 1220 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1221 const char *errors /* error handling */ 1222 ); 1223#endif 1224 1225/* --- UTF-8 Codecs ------------------------------------------------------- */ 1226 1227PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 1228 const char *string, /* UTF-8 encoded string */ 1229 Py_ssize_t length, /* size of string */ 1230 const char *errors /* error handling */ 1231 ); 1232 1233PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 1234 const char *string, /* UTF-8 encoded string */ 1235 Py_ssize_t length, /* size of string */ 1236 const char *errors, /* error handling */ 1237 Py_ssize_t *consumed /* bytes consumed */ 1238 ); 1239 1240PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 1241 PyObject *unicode /* Unicode object */ 1242 ); 1243 1244#ifndef Py_LIMITED_API 1245PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( 1246 PyObject *unicode, 1247 const char *errors); 1248 1249PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 1250 const Py_UNICODE *data, /* Unicode char buffer */ 1251 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1252 const char *errors /* error handling */ 1253 ); 1254#endif 1255 1256/* --- UTF-32 Codecs ------------------------------------------------------ */ 1257 1258/* Decodes length bytes from a UTF-32 encoded buffer string and returns 1259 the corresponding Unicode object. 1260 1261 errors (if non-NULL) defines the error handling. It defaults 1262 to "strict". 1263 1264 If byteorder is non-NULL, the decoder starts decoding using the 1265 given byte order: 1266 1267 *byteorder == -1: little endian 1268 *byteorder == 0: native order 1269 *byteorder == 1: big endian 1270 1271 In native mode, the first four bytes of the stream are checked for a 1272 BOM mark. If found, the BOM mark is analysed, the byte order 1273 adjusted and the BOM skipped. In the other modes, no BOM mark 1274 interpretation is done. 
After completion, *byteorder is set to the 1275 current byte order at the end of input data. 1276 1277 If byteorder is NULL, the codec starts in native order mode. 1278 1279*/ 1280 1281PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 1282 const char *string, /* UTF-32 encoded string */ 1283 Py_ssize_t length, /* size of string */ 1284 const char *errors, /* error handling */ 1285 int *byteorder /* pointer to byteorder to use 1286 0=native;-1=LE,1=BE; updated on 1287 exit */ 1288 ); 1289 1290PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 1291 const char *string, /* UTF-32 encoded string */ 1292 Py_ssize_t length, /* size of string */ 1293 const char *errors, /* error handling */ 1294 int *byteorder, /* pointer to byteorder to use 1295 0=native;-1=LE,1=BE; updated on 1296 exit */ 1297 Py_ssize_t *consumed /* bytes consumed */ 1298 ); 1299 1300/* Returns a Python string using the UTF-32 encoding in native byte 1301 order. The string always starts with a BOM mark. */ 1302 1303PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 1304 PyObject *unicode /* Unicode object */ 1305 ); 1306 1307/* Returns a Python string object holding the UTF-32 encoded value of 1308 the Unicode data. 1309 1310 If byteorder is not 0, output is written according to the following 1311 byte order: 1312 1313 byteorder == -1: little endian 1314 byteorder == 0: native byte order (writes a BOM mark) 1315 byteorder == 1: big endian 1316 1317 If byteorder is 0, the output string will always start with the 1318 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1319 prepended. 
1320 1321*/ 1322 1323#ifndef Py_LIMITED_API 1324PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 1325 const Py_UNICODE *data, /* Unicode char buffer */ 1326 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1327 const char *errors, /* error handling */ 1328 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1329 ); 1330PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32( 1331 PyObject *object, /* Unicode object */ 1332 const char *errors, /* error handling */ 1333 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1334 ); 1335#endif 1336 1337/* --- UTF-16 Codecs ------------------------------------------------------ */ 1338 1339/* Decodes length bytes from a UTF-16 encoded buffer string and returns 1340 the corresponding Unicode object. 1341 1342 errors (if non-NULL) defines the error handling. It defaults 1343 to "strict". 1344 1345 If byteorder is non-NULL, the decoder starts decoding using the 1346 given byte order: 1347 1348 *byteorder == -1: little endian 1349 *byteorder == 0: native order 1350 *byteorder == 1: big endian 1351 1352 In native mode, the first two bytes of the stream are checked for a 1353 BOM mark. If found, the BOM mark is analysed, the byte order 1354 adjusted and the BOM skipped. In the other modes, no BOM mark 1355 interpretation is done. After completion, *byteorder is set to the 1356 current byte order at the end of input data. 1357 1358 If byteorder is NULL, the codec starts in native order mode. 
1359 1360*/ 1361 1362PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 1363 const char *string, /* UTF-16 encoded string */ 1364 Py_ssize_t length, /* size of string */ 1365 const char *errors, /* error handling */ 1366 int *byteorder /* pointer to byteorder to use 1367 0=native;-1=LE,1=BE; updated on 1368 exit */ 1369 ); 1370 1371PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 1372 const char *string, /* UTF-16 encoded string */ 1373 Py_ssize_t length, /* size of string */ 1374 const char *errors, /* error handling */ 1375 int *byteorder, /* pointer to byteorder to use 1376 0=native;-1=LE,1=BE; updated on 1377 exit */ 1378 Py_ssize_t *consumed /* bytes consumed */ 1379 ); 1380 1381/* Returns a Python string using the UTF-16 encoding in native byte 1382 order. The string always starts with a BOM mark. */ 1383 1384PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 1385 PyObject *unicode /* Unicode object */ 1386 ); 1387 1388/* Returns a Python string object holding the UTF-16 encoded value of 1389 the Unicode data. 1390 1391 If byteorder is not 0, output is written according to the following 1392 byte order: 1393 1394 byteorder == -1: little endian 1395 byteorder == 0: native byte order (writes a BOM mark) 1396 byteorder == 1: big endian 1397 1398 If byteorder is 0, the output string will always start with the 1399 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1400 prepended. 1401 1402 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 1403 UCS-2. This trick makes it possible to add full UTF-16 capabilities 1404 at a later point without compromising the APIs. 
1405 1406*/ 1407 1408#ifndef Py_LIMITED_API 1409PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 1410 const Py_UNICODE *data, /* Unicode char buffer */ 1411 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1412 const char *errors, /* error handling */ 1413 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1414 ); 1415PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16( 1416 PyObject* unicode, /* Unicode object */ 1417 const char *errors, /* error handling */ 1418 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1419 ); 1420#endif 1421 1422/* --- Unicode-Escape Codecs ---------------------------------------------- */ 1423 1424PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 1425 const char *string, /* Unicode-Escape encoded string */ 1426 Py_ssize_t length, /* size of string */ 1427 const char *errors /* error handling */ 1428 ); 1429 1430PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 1431 PyObject *unicode /* Unicode object */ 1432 ); 1433 1434#ifndef Py_LIMITED_API 1435PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 1436 const Py_UNICODE *data, /* Unicode char buffer */ 1437 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1438 ); 1439#endif 1440 1441/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 1442 1443PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 1444 const char *string, /* Raw-Unicode-Escape encoded string */ 1445 Py_ssize_t length, /* size of string */ 1446 const char *errors /* error handling */ 1447 ); 1448 1449PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 1450 PyObject *unicode /* Unicode object */ 1451 ); 1452 1453#ifndef Py_LIMITED_API 1454PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 1455 const Py_UNICODE *data, /* Unicode char buffer */ 1456 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1457 ); 1458#endif 1459 1460/* --- Unicode Internal Codec --------------------------------------------- 1461 1462 Only for internal use 
in _codecsmodule.c */ 1463 1464#ifndef Py_LIMITED_API 1465PyObject *_PyUnicode_DecodeUnicodeInternal( 1466 const char *string, 1467 Py_ssize_t length, 1468 const char *errors 1469 ); 1470#endif 1471 1472/* --- Latin-1 Codecs ----------------------------------------------------- 1473 1474 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1475 1476*/ 1477 1478PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1479 const char *string, /* Latin-1 encoded string */ 1480 Py_ssize_t length, /* size of string */ 1481 const char *errors /* error handling */ 1482 ); 1483 1484PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1485 PyObject *unicode /* Unicode object */ 1486 ); 1487 1488#ifndef Py_LIMITED_API 1489PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( 1490 PyObject* unicode, 1491 const char* errors); 1492 1493PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1494 const Py_UNICODE *data, /* Unicode char buffer */ 1495 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1496 const char *errors /* error handling */ 1497 ); 1498#endif 1499 1500/* --- ASCII Codecs ------------------------------------------------------- 1501 1502 Only 7-bit ASCII data is excepted. All other codes generate errors. 
1503 1504*/ 1505 1506PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1507 const char *string, /* ASCII encoded string */ 1508 Py_ssize_t length, /* size of string */ 1509 const char *errors /* error handling */ 1510 ); 1511 1512PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1513 PyObject *unicode /* Unicode object */ 1514 ); 1515 1516#ifndef Py_LIMITED_API 1517PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( 1518 PyObject* unicode, 1519 const char* errors); 1520 1521PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1522 const Py_UNICODE *data, /* Unicode char buffer */ 1523 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1524 const char *errors /* error handling */ 1525 ); 1526#endif 1527 1528/* --- Character Map Codecs ----------------------------------------------- 1529 1530 This codec uses mappings to encode and decode characters. 1531 1532 Decoding mappings must map single string characters to single 1533 Unicode characters, integers (which are then interpreted as Unicode 1534 ordinals) or None (meaning "undefined mapping" and causing an 1535 error). 1536 1537 Encoding mappings must map single Unicode characters to single 1538 string characters, integers (which are then interpreted as Latin-1 1539 ordinals) or None (meaning "undefined mapping" and causing an 1540 error). 1541 1542 If a character lookup fails with a LookupError, the character is 1543 copied as-is meaning that its ordinal value will be interpreted as 1544 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1545 to contain those mappings which map characters to different code 1546 points. 
1547 1548*/ 1549 1550PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1551 const char *string, /* Encoded string */ 1552 Py_ssize_t length, /* size of string */ 1553 PyObject *mapping, /* character mapping 1554 (char ordinal -> unicode ordinal) */ 1555 const char *errors /* error handling */ 1556 ); 1557 1558PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1559 PyObject *unicode, /* Unicode object */ 1560 PyObject *mapping /* character mapping 1561 (unicode ordinal -> char ordinal) */ 1562 ); 1563 1564#ifndef Py_LIMITED_API 1565PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1566 const Py_UNICODE *data, /* Unicode char buffer */ 1567 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1568 PyObject *mapping, /* character mapping 1569 (unicode ordinal -> char ordinal) */ 1570 const char *errors /* error handling */ 1571 ); 1572PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap( 1573 PyObject *unicode, /* Unicode object */ 1574 PyObject *mapping, /* character mapping 1575 (unicode ordinal -> char ordinal) */ 1576 const char *errors /* error handling */ 1577 ); 1578#endif 1579 1580/* Translate a Py_UNICODE buffer of the given length by applying a 1581 character mapping table to it and return the resulting Unicode 1582 object. 1583 1584 The mapping table must map Unicode ordinal integers to Unicode 1585 ordinal integers or None (causing deletion of the character). 1586 1587 Mapping tables may be dictionaries or sequences. Unmapped character 1588 ordinals (ones which cause a LookupError) are left untouched and 1589 are copied as-is. 
1590 1591*/ 1592 1593#ifndef Py_LIMITED_API 1594PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1595 const Py_UNICODE *data, /* Unicode char buffer */ 1596 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1597 PyObject *table, /* Translate table */ 1598 const char *errors /* error handling */ 1599 ); 1600#endif 1601 1602#ifdef HAVE_MBCS 1603 1604/* --- MBCS codecs for Windows -------------------------------------------- */ 1605 1606PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1607 const char *string, /* MBCS encoded string */ 1608 Py_ssize_t length, /* size of string */ 1609 const char *errors /* error handling */ 1610 ); 1611 1612PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1613 const char *string, /* MBCS encoded string */ 1614 Py_ssize_t length, /* size of string */ 1615 const char *errors, /* error handling */ 1616 Py_ssize_t *consumed /* bytes consumed */ 1617 ); 1618 1619PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 1620 int code_page, /* code page number */ 1621 const char *string, /* encoded string */ 1622 Py_ssize_t length, /* size of string */ 1623 const char *errors, /* error handling */ 1624 Py_ssize_t *consumed /* bytes consumed */ 1625 ); 1626 1627PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1628 PyObject *unicode /* Unicode object */ 1629 ); 1630 1631#ifndef Py_LIMITED_API 1632PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1633 const Py_UNICODE *data, /* Unicode char buffer */ 1634 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1635 const char *errors /* error handling */ 1636 ); 1637#endif 1638 1639PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 1640 int code_page, /* code page number */ 1641 PyObject *unicode, /* Unicode object */ 1642 const char *errors /* error handling */ 1643 ); 1644 1645#endif /* HAVE_MBCS */ 1646 1647/* --- Decimal Encoder ---------------------------------------------------- */ 1648 1649/* Takes a Unicode string holding a decimal value and writes it into 1650 an output buffer using 
standard ASCII digit codes. 1651 1652 The output buffer has to provide at least length+1 bytes of storage 1653 area. The output string is 0-terminated. 1654 1655 The encoder converts whitespace to ' ', decimal characters to their 1656 corresponding ASCII digit and all other Latin-1 characters except 1657 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 1658 are treated as errors. This includes embedded NULL bytes. 1659 1660 Error handling is defined by the errors argument: 1661 1662 NULL or "strict": raise a ValueError 1663 "ignore": ignore the wrong characters (these are not copied to the 1664 output buffer) 1665 "replace": replaces illegal characters with '?' 1666 1667 Returns 0 on success, -1 on failure. 1668 1669*/ 1670 1671#ifndef Py_LIMITED_API 1672PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1673 Py_UNICODE *s, /* Unicode buffer */ 1674 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1675 char *output, /* Output buffer; must have size >= length */ 1676 const char *errors /* error handling */ 1677 ); 1678#endif 1679 1680/* Transforms code points that have decimal digit property to the 1681 corresponding ASCII digit code points. 1682 1683 Returns a new Unicode string on success, NULL on failure. 1684*/ 1685 1686#ifndef Py_LIMITED_API 1687PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( 1688 Py_UNICODE *s, /* Unicode buffer */ 1689 Py_ssize_t length /* Number of Py_UNICODE chars to transform */ 1690 ); 1691#endif 1692 1693/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject 1694 as argument instead of a raw buffer and length. This function additionally 1695 transforms spaces to ASCII because this is what the callers in longobject, 1696 floatobject, and complexobject did anyways. 
*/ 1697 1698#ifndef Py_LIMITED_API 1699PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( 1700 PyObject *unicode /* Unicode object */ 1701 ); 1702#endif 1703 1704/* --- Locale encoding --------------------------------------------------- */ 1705 1706/* Decode a string from the current locale encoding. The decoder is strict if 1707 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 1708 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 1709 be decoded as a surrogate character and *surrogateescape* is not equal to 1710 zero, the byte sequence is escaped using the 'surrogateescape' error handler 1711 instead of being decoded. *str* must end with a null character but cannot 1712 contain embedded null characters. */ 1713 1714PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 1715 const char *str, 1716 Py_ssize_t len, 1717 const char *errors); 1718 1719/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 1720 length using strlen(). */ 1721 1722PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 1723 const char *str, 1724 const char *errors); 1725 1726/* Encode a Unicode object to the current locale encoding. The encoder is 1727 strict is *surrogateescape* is equal to zero, otherwise the 1728 "surrogateescape" error handler is used. Return a bytes object. The string 1729 cannot contain embedded null characters.. */ 1730 1731PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 1732 PyObject *unicode, 1733 const char *errors 1734 ); 1735 1736/* --- File system encoding ---------------------------------------------- */ 1737 1738/* ParseTuple converter: encode str objects to bytes using 1739 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 1740 1741PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1742 1743/* ParseTuple converter: decode bytes objects to unicode using 1744 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. 
*/ 1745 1746PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1747 1748/* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1749 and the "surrogateescape" error handler. 1750 1751 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1752 encoding. 1753 1754 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1755*/ 1756 1757PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1758 const char *s /* encoded string */ 1759 ); 1760 1761/* Decode a string using Py_FileSystemDefaultEncoding 1762 and the "surrogateescape" error handler. 1763 1764 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1765 encoding. 1766*/ 1767 1768PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1769 const char *s, /* encoded string */ 1770 Py_ssize_t size /* size */ 1771 ); 1772 1773/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1774 "surrogateescape" error handler, and return bytes. 1775 1776 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1777 encoding. 1778*/ 1779 1780PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1781 PyObject *unicode 1782 ); 1783 1784/* --- Methods & Slots ---------------------------------------------------- 1785 1786 These are capable of handling Unicode objects and strings on input 1787 (we refer to them as strings in the descriptions) and return 1788 Unicode objects or integers as appropriate. */ 1789 1790/* Concat two strings giving a new Unicode string. 
*/ 1791 1792PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1793 PyObject *left, /* Left string */ 1794 PyObject *right /* Right string */ 1795 ); 1796 1797/* Concat two strings and put the result in *pleft 1798 (sets *pleft to NULL on error) */ 1799 1800PyAPI_FUNC(void) PyUnicode_Append( 1801 PyObject **pleft, /* Pointer to left string */ 1802 PyObject *right /* Right string */ 1803 ); 1804 1805/* Concat two strings, put the result in *pleft and drop the right object 1806 (sets *pleft to NULL on error) */ 1807 1808PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1809 PyObject **pleft, /* Pointer to left string */ 1810 PyObject *right /* Right string */ 1811 ); 1812 1813/* Split a string giving a list of Unicode strings. 1814 1815 If sep is NULL, splitting will be done at all whitespace 1816 substrings. Otherwise, splits occur at the given separator. 1817 1818 At most maxsplit splits will be done. If negative, no limit is set. 1819 1820 Separators are not included in the resulting list. 1821 1822*/ 1823 1824PyAPI_FUNC(PyObject*) PyUnicode_Split( 1825 PyObject *s, /* String to split */ 1826 PyObject *sep, /* String separator */ 1827 Py_ssize_t maxsplit /* Maxsplit count */ 1828 ); 1829 1830/* Dito, but split at line breaks. 1831 1832 CRLF is considered to be one line break. Line breaks are not 1833 included in the resulting list. */ 1834 1835PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1836 PyObject *s, /* String to split */ 1837 int keepends /* If true, line end markers are included */ 1838 ); 1839 1840/* Partition a string using a given separator. */ 1841 1842PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1843 PyObject *s, /* String to partition */ 1844 PyObject *sep /* String separator */ 1845 ); 1846 1847/* Partition a string using a given separator, searching from the end of the 1848 string. 
*/ 1849 1850PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1851 PyObject *s, /* String to partition */ 1852 PyObject *sep /* String separator */ 1853 ); 1854 1855/* Split a string giving a list of Unicode strings. 1856 1857 If sep is NULL, splitting will be done at all whitespace 1858 substrings. Otherwise, splits occur at the given separator. 1859 1860 At most maxsplit splits will be done. But unlike PyUnicode_Split 1861 PyUnicode_RSplit splits from the end of the string. If negative, 1862 no limit is set. 1863 1864 Separators are not included in the resulting list. 1865 1866*/ 1867 1868PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1869 PyObject *s, /* String to split */ 1870 PyObject *sep, /* String separator */ 1871 Py_ssize_t maxsplit /* Maxsplit count */ 1872 ); 1873 1874/* Translate a string by applying a character mapping table to it and 1875 return the resulting Unicode object. 1876 1877 The mapping table must map Unicode ordinal integers to Unicode 1878 ordinal integers or None (causing deletion of the character). 1879 1880 Mapping tables may be dictionaries or sequences. Unmapped character 1881 ordinals (ones which cause a LookupError) are left untouched and 1882 are copied as-is. 1883 1884*/ 1885 1886PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1887 PyObject *str, /* String */ 1888 PyObject *table, /* Translate table */ 1889 const char *errors /* error handling */ 1890 ); 1891 1892/* Join a sequence of strings using the given separator and return 1893 the resulting Unicode string. */ 1894 1895PyAPI_FUNC(PyObject*) PyUnicode_Join( 1896 PyObject *separator, /* Separator string */ 1897 PyObject *seq /* Sequence object */ 1898 ); 1899 1900/* Return 1 if substr matches str[start:end] at the given tail end, 0 1901 otherwise. 
*/ 1902 1903PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1904 PyObject *str, /* String */ 1905 PyObject *substr, /* Prefix or Suffix string */ 1906 Py_ssize_t start, /* Start index */ 1907 Py_ssize_t end, /* Stop index */ 1908 int direction /* Tail end: -1 prefix, +1 suffix */ 1909 ); 1910 1911/* Return the first position of substr in str[start:end] using the 1912 given search direction or -1 if not found. -2 is returned in case 1913 an error occurred and an exception is set. */ 1914 1915PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1916 PyObject *str, /* String */ 1917 PyObject *substr, /* Substring to find */ 1918 Py_ssize_t start, /* Start index */ 1919 Py_ssize_t end, /* Stop index */ 1920 int direction /* Find direction: +1 forward, -1 backward */ 1921 ); 1922 1923/* Like PyUnicode_Find, but search for single character only. */ 1924PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 1925 PyObject *str, 1926 Py_UCS4 ch, 1927 Py_ssize_t start, 1928 Py_ssize_t end, 1929 int direction 1930 ); 1931 1932/* Count the number of occurrences of substr in str[start:end]. */ 1933 1934PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1935 PyObject *str, /* String */ 1936 PyObject *substr, /* Substring to count */ 1937 Py_ssize_t start, /* Start index */ 1938 Py_ssize_t end /* Stop index */ 1939 ); 1940 1941/* Replace at most maxcount occurrences of substr in str with replstr 1942 and return the resulting Unicode object. */ 1943 1944PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1945 PyObject *str, /* String */ 1946 PyObject *substr, /* Substring to find */ 1947 PyObject *replstr, /* Substring to replace */ 1948 Py_ssize_t maxcount /* Max. number of replacements to apply; 1949 -1 = all */ 1950 ); 1951 1952/* Compare two strings and return -1, 0, 1 for less than, equal, 1953 greater than resp. 1954 Raise an exception and return -1 on error. 
*/ 1955 1956PyAPI_FUNC(int) PyUnicode_Compare( 1957 PyObject *left, /* Left string */ 1958 PyObject *right /* Right string */ 1959 ); 1960 1961PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1962 PyObject *left, 1963 const char *right /* ASCII-encoded string */ 1964 ); 1965 1966/* Rich compare two strings and return one of the following: 1967 1968 - NULL in case an exception was raised 1969 - Py_True or Py_False for successfully comparisons 1970 - Py_NotImplemented in case the type combination is unknown 1971 1972 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1973 case the conversion of the arguments to Unicode fails with a 1974 UnicodeDecodeError. 1975 1976 Possible values for op: 1977 1978 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1979 1980*/ 1981 1982PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1983 PyObject *left, /* Left string */ 1984 PyObject *right, /* Right string */ 1985 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1986 ); 1987 1988/* Apply a argument tuple or dictionary to a format string and return 1989 the resulting Unicode string. */ 1990 1991PyAPI_FUNC(PyObject *) PyUnicode_Format( 1992 PyObject *format, /* Format string */ 1993 PyObject *args /* Argument tuple or dictionary */ 1994 ); 1995 1996/* Checks whether element is contained in container and return 1/0 1997 accordingly. 1998 1999 element has to coerce to an one element Unicode string. -1 is 2000 returned in case of an error. */ 2001 2002PyAPI_FUNC(int) PyUnicode_Contains( 2003 PyObject *container, /* Container string */ 2004 PyObject *element /* Element string */ 2005 ); 2006 2007/* Checks whether the string contains any NUL characters. */ 2008 2009#ifndef Py_LIMITED_API 2010PyAPI_FUNC(int) _PyUnicode_HasNULChars(PyObject *); 2011#endif 2012 2013/* Checks whether argument is a valid identifier. 
*/ 2014 2015PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 2016 2017#ifndef Py_LIMITED_API 2018/* Externally visible for str.strip(unicode) */ 2019PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 2020 PyObject *self, 2021 int striptype, 2022 PyObject *sepobj 2023 ); 2024#endif 2025 2026/* Using explicit passed-in values, insert the thousands grouping 2027 into the string pointed to by buffer. For the argument descriptions, 2028 see Objects/stringlib/localeutil.h */ 2029#ifndef Py_LIMITED_API 2030PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( 2031 PyObject *unicode, 2032 Py_ssize_t index, 2033 Py_ssize_t n_buffer, 2034 void *digits, 2035 Py_ssize_t n_digits, 2036 Py_ssize_t min_width, 2037 const char *grouping, 2038 PyObject *thousands_sep, 2039 Py_UCS4 *maxchar); 2040#endif 2041/* === Characters Type APIs =============================================== */ 2042 2043/* Helper array used by Py_UNICODE_ISSPACE(). */ 2044 2045#ifndef Py_LIMITED_API 2046PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 2047 2048/* These should not be used directly. Use the Py_UNICODE_IS* and 2049 Py_UNICODE_TO* macros instead. 2050 2051 These APIs are implemented in Objects/unicodectype.c. 
2052 2053*/ 2054 2055PyAPI_FUNC(int) _PyUnicode_IsLowercase( 2056 Py_UCS4 ch /* Unicode character */ 2057 ); 2058 2059PyAPI_FUNC(int) _PyUnicode_IsUppercase( 2060 Py_UCS4 ch /* Unicode character */ 2061 ); 2062 2063PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 2064 Py_UCS4 ch /* Unicode character */ 2065 ); 2066 2067PyAPI_FUNC(int) _PyUnicode_IsXidStart( 2068 Py_UCS4 ch /* Unicode character */ 2069 ); 2070 2071PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 2072 Py_UCS4 ch /* Unicode character */ 2073 ); 2074 2075PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 2076 const Py_UCS4 ch /* Unicode character */ 2077 ); 2078 2079PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 2080 const Py_UCS4 ch /* Unicode character */ 2081 ); 2082 2083PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 2084 Py_UCS4 ch /* Unicode character */ 2085 ); 2086 2087PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 2088 Py_UCS4 ch /* Unicode character */ 2089 ); 2090 2091PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 2092 Py_UCS4 ch /* Unicode character */ 2093 ); 2094 2095PyAPI_FUNC(int) _PyUnicode_ToLowerFull( 2096 Py_UCS4 ch, /* Unicode character */ 2097 Py_UCS4 *res 2098 ); 2099 2100PyAPI_FUNC(int) _PyUnicode_ToTitleFull( 2101 Py_UCS4 ch, /* Unicode character */ 2102 Py_UCS4 *res 2103 ); 2104 2105PyAPI_FUNC(int) _PyUnicode_ToUpperFull( 2106 Py_UCS4 ch, /* Unicode character */ 2107 Py_UCS4 *res 2108 ); 2109 2110PyAPI_FUNC(int) _PyUnicode_ToFoldedFull( 2111 Py_UCS4 ch, /* Unicode character */ 2112 Py_UCS4 *res 2113 ); 2114 2115PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( 2116 Py_UCS4 ch /* Unicode character */ 2117 ); 2118 2119PyAPI_FUNC(int) _PyUnicode_IsCased( 2120 Py_UCS4 ch /* Unicode character */ 2121 ); 2122 2123PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 2124 Py_UCS4 ch /* Unicode character */ 2125 ); 2126 2127PyAPI_FUNC(int) _PyUnicode_ToDigit( 2128 Py_UCS4 ch /* Unicode character */ 2129 ); 2130 2131PyAPI_FUNC(double) _PyUnicode_ToNumeric( 2132 Py_UCS4 ch /* Unicode character */ 2133 ); 2134 2135PyAPI_FUNC(int) 
_PyUnicode_IsDecimalDigit( 2136 Py_UCS4 ch /* Unicode character */ 2137 ); 2138 2139PyAPI_FUNC(int) _PyUnicode_IsDigit( 2140 Py_UCS4 ch /* Unicode character */ 2141 ); 2142 2143PyAPI_FUNC(int) _PyUnicode_IsNumeric( 2144 Py_UCS4 ch /* Unicode character */ 2145 ); 2146 2147PyAPI_FUNC(int) _PyUnicode_IsPrintable( 2148 Py_UCS4 ch /* Unicode character */ 2149 ); 2150 2151PyAPI_FUNC(int) _PyUnicode_IsAlpha( 2152 Py_UCS4 ch /* Unicode character */ 2153 ); 2154 2155PyAPI_FUNC(size_t) Py_UNICODE_strlen( 2156 const Py_UNICODE *u 2157 ); 2158 2159PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 2160 Py_UNICODE *s1, 2161 const Py_UNICODE *s2); 2162 2163PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat( 2164 Py_UNICODE *s1, const Py_UNICODE *s2); 2165 2166PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 2167 Py_UNICODE *s1, 2168 const Py_UNICODE *s2, 2169 size_t n); 2170 2171PyAPI_FUNC(int) Py_UNICODE_strcmp( 2172 const Py_UNICODE *s1, 2173 const Py_UNICODE *s2 2174 ); 2175 2176PyAPI_FUNC(int) Py_UNICODE_strncmp( 2177 const Py_UNICODE *s1, 2178 const Py_UNICODE *s2, 2179 size_t n 2180 ); 2181 2182PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 2183 const Py_UNICODE *s, 2184 Py_UNICODE c 2185 ); 2186 2187PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 2188 const Py_UNICODE *s, 2189 Py_UNICODE c 2190 ); 2191 2192/* Create a copy of a unicode string ending with a nul character. Return NULL 2193 and raise a MemoryError exception on memory allocation failure, otherwise 2194 return a new allocated buffer (use PyMem_Free() to free the buffer). 
*/ 2195 2196PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 2197 PyObject *unicode 2198 ); 2199#endif /* Py_LIMITED_API */ 2200 2201#if defined(Py_DEBUG) && !defined(Py_LIMITED_API) 2202PyAPI_FUNC(int) _PyUnicode_CheckConsistency( 2203 PyObject *op, 2204 int check_content); 2205#endif 2206 2207/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/ 2208PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); 2209/* Clear all static strings. */ 2210PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); 2211 2212#ifdef __cplusplus 2213} 2214#endif 2215#endif /* !Py_UNICODEOBJECT_H */ 2216