unicodeobject.h revision ece58deb9fd72674b84ef7a01c944b5eed6b37a1
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal. (See 11http://www.egenix.com/files/python/unicode-proposal.txt). 12 13Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58#include <ctype.h> 59 60/* === Internal API ======================================================= */ 61 62/* --- Internal Unicode Format -------------------------------------------- */ 63 64/* Python 3.x requires unicode */ 65#define Py_USING_UNICODE 66 67#ifndef SIZEOF_WCHAR_T 68#error Must define SIZEOF_WCHAR_T 69#endif 70 71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Py_UNICODE was the native Unicode storage format (code unit) used by 87 Python and represents a single Unicode element in the Unicode type. 88 With PEP 393, Py_UNICODE is deprecated and replaced with a 89 typedef to wchar_t. */ 90 91#ifndef Py_LIMITED_API 92#define PY_UNICODE_TYPE wchar_t 93typedef wchar_t Py_UNICODE; 94#endif 95 96/* If the compiler provides a wchar_t type we try to support it 97 through the interface functions PyUnicode_FromWideChar(), 98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). 
*/ 99 100#ifdef HAVE_USABLE_WCHAR_T 101# ifndef HAVE_WCHAR_H 102# define HAVE_WCHAR_H 103# endif 104#endif 105 106#if defined(MS_WINDOWS) 107# define HAVE_MBCS 108#endif 109 110#ifdef HAVE_WCHAR_H 111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 112# ifdef _HAVE_BSDI 113# include <time.h> 114# endif 115# include <wchar.h> 116#endif 117 118/* Py_UCS4 and Py_UCS2 are typedefs for the respective 119 unicode representations. */ 120#if SIZEOF_INT == 4 121typedef unsigned int Py_UCS4; 122#elif SIZEOF_LONG == 4 123typedef unsigned long Py_UCS4; 124#else 125#error "Could not find a proper typedef for Py_UCS4" 126#endif 127 128#if SIZEOF_SHORT == 2 129typedef unsigned short Py_UCS2; 130#else 131#error "Could not find a proper typedef for Py_UCS2" 132#endif 133 134typedef unsigned char Py_UCS1; 135 136/* --- Internal Unicode Operations ---------------------------------------- */ 137 138/* Since splitting on whitespace is an important use case, and 139 whitespace in most situations is solely ASCII whitespace, we 140 optimize for the common case by using a quick look-up table 141 _Py_ascii_whitespace (see below) with an inlined check. 142 143 */ 144#ifndef Py_LIMITED_API 145#define Py_UNICODE_ISSPACE(ch) \ 146 ((ch) < 128U ? 
_Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 147 148#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 149#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 150#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 151#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 152 153#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 154#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 155#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 156 157#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 158#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 159#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 160#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 161 162#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 163#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 164#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 165 166#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 167 168#define Py_UNICODE_ISALNUM(ch) \ 169 (Py_UNICODE_ISALPHA(ch) || \ 170 Py_UNICODE_ISDECIMAL(ch) || \ 171 Py_UNICODE_ISDIGIT(ch) || \ 172 Py_UNICODE_ISNUMERIC(ch)) 173 174#define Py_UNICODE_COPY(target, source, length) \ 175 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 176 177#define Py_UNICODE_FILL(target, value, length) \ 178 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 179 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 180 } while (0) 181 182/* macros to work with surrogates */ 183#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDFFF) 184#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) 185#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) 186/* Join two surrogate characters and return a single Py_UCS4 value. 
*/ 187#define Py_UNICODE_JOIN_SURROGATES(high, low) \ 188 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 189 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 190/* high surrogate = top 10 bits added to D800 */ 191#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 | (((ch) - 0x10000) >> 10)) 192/* low surrogate = bottom 10 bits added to DC00 */ 193#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 | (((ch) - 0x10000) & 0x3FF)) 194 195/* Check if substring matches at given offset. The offset must be 196 valid, and the substring must not be empty. */ 197 198#define Py_UNICODE_MATCH(string, offset, substring) \ 199 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ 200 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ 201 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) 202 203#endif /* Py_LIMITED_API */ 204 205#ifdef __cplusplus 206extern "C" { 207#endif 208 209/* --- Unicode Type ------------------------------------------------------- */ 210 211#ifndef Py_LIMITED_API 212 213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject 214 structure. state.ascii and state.compact are set, and the data 215 immediately follow the structure. utf8_length and wstr_length can be found 216 in the length field; the utf8 pointer is equal to the data pointer. 
*/ 217typedef struct { 218 /* There are 4 forms of Unicode strings: 219 220 - compact ascii: 221 222 * structure = PyASCIIObject 223 * test: PyUnicode_IS_COMPACT_ASCII(op) 224 * kind = PyUnicode_1BYTE_KIND 225 * compact = 1 226 * ascii = 1 227 * ready = 1 228 * (length is the length of the utf8 and wstr strings) 229 * (data starts just after the structure) 230 * (since ASCII is decoded from UTF-8, the utf8 string are the data) 231 232 - compact: 233 234 * structure = PyCompactUnicodeObject 235 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op) 236 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 237 PyUnicode_4BYTE_KIND 238 * compact = 1 239 * ready = 1 240 * ascii = 0 241 * utf8 is not shared with data 242 * utf8_length = 0 if utf8 is NULL 243 * wstr is shared with data and wstr_length=length 244 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 245 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4 246 * wstr_length = 0 if wstr is NULL 247 * (data starts just after the structure) 248 249 - legacy string, not ready: 250 251 * structure = PyUnicodeObject 252 * test: kind == PyUnicode_WCHAR_KIND 253 * length = 0 (use wstr_length) 254 * hash = -1 255 * kind = PyUnicode_WCHAR_KIND 256 * compact = 0 257 * ascii = 0 258 * ready = 0 259 * interned = SSTATE_NOT_INTERNED 260 * wstr is not NULL 261 * data.any is NULL 262 * utf8 is NULL 263 * utf8_length = 0 264 265 - legacy string, ready: 266 267 * structure = PyUnicodeObject structure 268 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND 269 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 270 PyUnicode_4BYTE_KIND 271 * compact = 0 272 * ready = 1 273 * data.any is not NULL 274 * utf8 is shared and utf8_length = length with data.any if ascii = 1 275 * utf8_length = 0 if utf8 is NULL 276 * wstr is shared with data.any and wstr_length = length 277 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 278 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 279 * wstr_length = 0 if wstr 
is NULL 280 281 Compact strings use only one memory block (structure + characters), 282 whereas legacy strings use one block for the structure and one block 283 for characters. 284 285 Legacy strings are created by PyUnicode_FromUnicode() and 286 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready 287 when PyUnicode_READY() is called. 288 289 See also _PyUnicode_CheckConsistency(). 290 */ 291 PyObject_HEAD 292 Py_ssize_t length; /* Number of code points in the string */ 293 Py_hash_t hash; /* Hash value; -1 if not set */ 294 struct { 295 /* 296 SSTATE_NOT_INTERNED (0) 297 SSTATE_INTERNED_MORTAL (1) 298 SSTATE_INTERNED_IMMORTAL (2) 299 300 If interned != SSTATE_NOT_INTERNED, the two references from the 301 dictionary to this object are *not* counted in ob_refcnt. 302 */ 303 unsigned int interned:2; 304 /* Character size: 305 306 - PyUnicode_WCHAR_KIND (0): 307 308 * character type = wchar_t (16 or 32 bits, depending on the 309 platform) 310 311 - PyUnicode_1BYTE_KIND (1): 312 313 * character type = Py_UCS1 (8 bits, unsigned) 314 * all characters are in the range U+0000-U+00FF (latin1) 315 * if ascii is set, all characters are in the range U+0000-U+007F 316 (ASCII), otherwise at least one character is in the range 317 U+0080-U+00FF 318 319 - PyUnicode_2BYTE_KIND (2): 320 321 * character type = Py_UCS2 (16 bits, unsigned) 322 * all characters are in the range U+0000-U+FFFF (BMP) 323 * at least one character is in the range U+0100-U+FFFF 324 325 - PyUnicode_4BYTE_KIND (4): 326 327 * character type = Py_UCS4 (32 bits, unsigned) 328 * all characters are in the range U+0000-U+10FFFF 329 * at least one character is in the range U+10000-U+10FFFF 330 */ 331 unsigned int kind:3; 332 /* Compact is with respect to the allocation scheme. Compact unicode 333 objects only require one memory block while non-compact objects use 334 one block for the PyUnicodeObject struct and another for its data 335 buffer. 
*/ 336 unsigned int compact:1; 337 /* The string only contains characters in the range U+0000-U+007F (ASCII) 338 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is 339 set, use the PyASCIIObject structure. */ 340 unsigned int ascii:1; 341 /* The ready flag indicates whether the object layout is initialized 342 completely. This means that this is either a compact object, or 343 the data pointer is filled out. The bit is redundant, and helps 344 to minimize the test in PyUnicode_IS_READY(). */ 345 unsigned int ready:1; 346 } state; 347 wchar_t *wstr; /* wchar_t representation (null-terminated) */ 348} PyASCIIObject; 349 350/* Non-ASCII strings allocated through PyUnicode_New use the 351 PyCompactUnicodeObject structure. state.compact is set, and the data 352 immediately follow the structure. */ 353typedef struct { 354 PyASCIIObject _base; 355 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the 356 * terminating \0. */ 357 char *utf8; /* UTF-8 representation (null-terminated) */ 358 Py_ssize_t wstr_length; /* Number of code points in wstr, possible 359 * surrogates count as two code points. */ 360} PyCompactUnicodeObject; 361 362/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the 363 PyUnicodeObject structure. The actual string data is initially in the wstr 364 block, and copied into the data block using _PyUnicode_Ready. 
*/ 365typedef struct { 366 PyCompactUnicodeObject _base; 367 union { 368 void *any; 369 Py_UCS1 *latin1; 370 Py_UCS2 *ucs2; 371 Py_UCS4 *ucs4; 372 } data; /* Canonical, smallest-form Unicode buffer */ 373} PyUnicodeObject; 374#endif 375 376PyAPI_DATA(PyTypeObject) PyUnicode_Type; 377PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 378 379#define PyUnicode_Check(op) \ 380 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 381#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 382 383/* Fast access macros */ 384#ifndef Py_LIMITED_API 385 386#define PyUnicode_WSTR_LENGTH(op) \ 387 (PyUnicode_IS_COMPACT_ASCII(op) ? \ 388 ((PyASCIIObject*)op)->length : \ 389 ((PyCompactUnicodeObject*)op)->wstr_length) 390 391/* Returns the deprecated Py_UNICODE representation's size in code units 392 (this includes surrogate pairs as 2 units). 393 If the Py_UNICODE representation is not available, it will be computed 394 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ 395 396#define PyUnicode_GET_SIZE(op) \ 397 (assert(PyUnicode_Check(op)), \ 398 (((PyASCIIObject *)(op))->wstr) ? \ 399 PyUnicode_WSTR_LENGTH(op) : \ 400 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \ 401 assert(((PyASCIIObject *)(op))->wstr), \ 402 PyUnicode_WSTR_LENGTH(op))) 403 404#define PyUnicode_GET_DATA_SIZE(op) \ 405 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) 406 407/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE 408 representation on demand. Using this macro is very inefficient now, 409 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or 410 use PyUnicode_WRITE() and PyUnicode_READ(). */ 411 412#define PyUnicode_AS_UNICODE(op) \ 413 (assert(PyUnicode_Check(op)), \ 414 (((PyASCIIObject *)(op))->wstr) ? 
(((PyASCIIObject *)(op))->wstr) : \ 415 PyUnicode_AsUnicode((PyObject *)(op))) 416 417#define PyUnicode_AS_DATA(op) \ 418 ((const char *)(PyUnicode_AS_UNICODE(op))) 419 420 421/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ 422 423/* Values for PyASCIIObject.state: */ 424 425/* Interning state. */ 426#define SSTATE_NOT_INTERNED 0 427#define SSTATE_INTERNED_MORTAL 1 428#define SSTATE_INTERNED_IMMORTAL 2 429 430/* Return true if the string contains only ASCII characters, or 0 if not. The 431 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be 432 ready. */ 433#define PyUnicode_IS_ASCII(op) \ 434 (assert(PyUnicode_Check(op)), \ 435 assert(PyUnicode_IS_READY(op)), \ 436 ((PyASCIIObject*)op)->state.ascii) 437 438/* Return true if the string is compact or 0 if not. 439 No type checks or Ready calls are performed. */ 440#define PyUnicode_IS_COMPACT(op) \ 441 (((PyASCIIObject*)(op))->state.compact) 442 443/* Return true if the string is a compact ASCII string (use PyASCIIObject 444 structure), or 0 if not. No type checks or Ready calls are performed. */ 445#define PyUnicode_IS_COMPACT_ASCII(op) \ 446 (((PyASCIIObject*)op)->state.ascii && PyUnicode_IS_COMPACT(op)) 447 448enum PyUnicode_Kind { 449/* String contains only wstr byte characters. This is only possible 450 when the string was created with a legacy API and _PyUnicode_Ready() 451 has not been called yet. */ 452 PyUnicode_WCHAR_KIND = 0, 453/* Return values of the PyUnicode_KIND() macro: */ 454 PyUnicode_1BYTE_KIND = 1, 455 PyUnicode_2BYTE_KIND = 2, 456 PyUnicode_4BYTE_KIND = 4 457}; 458 459/* Return pointers to the canonical representation cast to unsigned char, 460 Py_UCS2, or Py_UCS4 for direct character access. 461 No checks are performed, use PyUnicode_KIND() before to ensure 462 these will work correctly. 
*/ 463 464#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op)) 465#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) 466#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) 467 468/* Return one of the PyUnicode_*_KIND values defined above. */ 469#define PyUnicode_KIND(op) \ 470 (assert(PyUnicode_Check(op)), \ 471 assert(PyUnicode_IS_READY(op)), \ 472 ((PyASCIIObject *)(op))->state.kind) 473 474/* Return a void pointer to the raw unicode buffer. */ 475#define _PyUnicode_COMPACT_DATA(op) \ 476 (PyUnicode_IS_ASCII(op) ? \ 477 ((void*)((PyASCIIObject*)(op) + 1)) : \ 478 ((void*)((PyCompactUnicodeObject*)(op) + 1))) 479 480#define _PyUnicode_NONCOMPACT_DATA(op) \ 481 (assert(((PyUnicodeObject*)(op))->data.any), \ 482 ((((PyUnicodeObject *)(op))->data.any))) 483 484#define PyUnicode_DATA(op) \ 485 (assert(PyUnicode_Check(op)), \ 486 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \ 487 _PyUnicode_NONCOMPACT_DATA(op)) 488 489/* In the access macros below, "kind" may be evaluated more than once. 490 All other macro parameters are evaluated exactly once, so it is safe 491 to put side effects into them (such as increasing the index). */ 492 493/* Write into the canonical representation, this macro does not do any sanity 494 checks and is intended for usage in loops. The caller should cache the 495 kind and data pointers obtained from other macro calls. 496 index is the index in the string (starts at 0) and value is the new 497 code point value which should be written to that location. 
*/ 498#define PyUnicode_WRITE(kind, data, index, value) \ 499 do { \ 500 switch ((kind)) { \ 501 case PyUnicode_1BYTE_KIND: { \ 502 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \ 503 break; \ 504 } \ 505 case PyUnicode_2BYTE_KIND: { \ 506 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \ 507 break; \ 508 } \ 509 default: { \ 510 assert((kind) == PyUnicode_4BYTE_KIND); \ 511 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \ 512 } \ 513 } \ 514 } while (0) 515 516/* Read a code point from the string's canonical representation. No checks 517 or ready calls are performed. */ 518#define PyUnicode_READ(kind, data, index) \ 519 ((Py_UCS4) \ 520 ((kind) == PyUnicode_1BYTE_KIND ? \ 521 ((const Py_UCS1 *)(data))[(index)] : \ 522 ((kind) == PyUnicode_2BYTE_KIND ? \ 523 ((const Py_UCS2 *)(data))[(index)] : \ 524 ((const Py_UCS4 *)(data))[(index)] \ 525 ) \ 526 )) 527 528/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it 529 calls PyUnicode_KIND() and might call it twice. For single reads, use 530 PyUnicode_READ_CHAR, for multiple consecutive reads callers should 531 cache kind and use PyUnicode_READ instead. */ 532#define PyUnicode_READ_CHAR(unicode, index) \ 533 (assert(PyUnicode_Check(unicode)), \ 534 assert(PyUnicode_IS_READY(unicode)), \ 535 (Py_UCS4) \ 536 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \ 537 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \ 538 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \ 539 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \ 540 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \ 541 ) \ 542 )) 543 544/* Returns the length of the unicode string. The caller has to make sure that 545 the string has it's canonical representation set before calling 546 this macro. Call PyUnicode_(FAST_)Ready to ensure that. 
*/ 547#define PyUnicode_GET_LENGTH(op) \ 548 (assert(PyUnicode_Check(op)), \ 549 assert(PyUnicode_IS_READY(op)), \ 550 ((PyASCIIObject *)(op))->length) 551 552 553/* Fast check to determine whether an object is ready. Equivalent to 554 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */ 555 556#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) 557 558/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best 559 case. If the canonical representation is not yet set, it will still call 560 _PyUnicode_Ready(). 561 Returns 0 on success and -1 on errors. */ 562#define PyUnicode_READY(op) \ 563 (assert(PyUnicode_Check(op)), \ 564 (PyUnicode_IS_READY(op) ? \ 565 0 : _PyUnicode_Ready((PyObject *)(op)))) 566 567/* Return a maximum character value which is suitable for creating another 568 string based on op. This is always an approximation but more efficient 569 than iterating over the string. */ 570#define PyUnicode_MAX_CHAR_VALUE(op) \ 571 (assert(PyUnicode_IS_READY(op)), \ 572 (PyUnicode_IS_ASCII(op) ? \ 573 (0x7f) : \ 574 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ 575 (0xffU) : \ 576 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \ 577 (0xffffU) : \ 578 (0x10ffffU))))) 579 580#endif 581 582/* --- Constants ---------------------------------------------------------- */ 583 584/* This Unicode character will be used as replacement character during 585 decoding if the errors argument is set to "replace". Note: the 586 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 587 Unicode 3.0. */ 588 589#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 590 591/* === Public API ========================================================= */ 592 593/* --- Plain Py_UNICODE --------------------------------------------------- */ 594 595/* With PEP 393, this is the recommended way to allocate a new unicode object. 596 This function will allocate the object and its buffer in a single memory 597 block. 
Objects created using this function are not resizable. */ 598#ifndef Py_LIMITED_API 599PyAPI_FUNC(PyObject*) PyUnicode_New( 600 Py_ssize_t size, /* Number of code points in the new string */ 601 Py_UCS4 maxchar /* maximum code point value in the string */ 602 ); 603#endif 604 605/* Initializes the canonical string representation from a the deprecated 606 wstr/Py_UNICODE representation. This function is used to convert Unicode 607 objects which were created using the old API to the new flexible format 608 introduced with PEP 393. 609 610 Don't call this function directly, use the public PyUnicode_READY() macro 611 instead. */ 612#ifndef Py_LIMITED_API 613PyAPI_FUNC(int) _PyUnicode_Ready( 614 PyObject *unicode /* Unicode object */ 615 ); 616#endif 617 618/* Get a copy of a Unicode string. */ 619#ifndef Py_LIMITED_API 620PyAPI_FUNC(PyObject*) _PyUnicode_Copy( 621 PyObject *unicode 622 ); 623#endif 624 625/* Copy character from one unicode object into another, this function performs 626 character conversion when necessary and falls back to memcpy() if possible. 627 628 Fail if to is too small (smaller than *how_many* or smaller than 629 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > 630 kind(to), or if *to* has more than 1 reference. 631 632 Return the number of written character, or return -1 and raise an exception 633 on error. 634 635 Pseudo-code: 636 637 how_many = min(how_many, len(from) - from_start) 638 to[to_start:to_start+how_many] = from[from_start:from_start+how_many] 639 return how_many 640 641 Note: The function doesn't write a terminating null character. 642 */ 643#ifndef Py_LIMITED_API 644PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( 645 PyObject *to, 646 Py_ssize_t to_start, 647 PyObject *from, 648 Py_ssize_t from_start, 649 Py_ssize_t how_many 650 ); 651#endif 652 653/* Fill a string with a character: write fill_char into 654 unicode[start:start+length]. 
655 656 Fail if fill_char is bigger than the string maximum character, or if the 657 string has more than 1 reference. 658 659 Return the number of written character, or return -1 and raise an exception 660 on error. */ 661#ifndef Py_LIMITED_API 662PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill( 663 PyObject *unicode, 664 Py_ssize_t start, 665 Py_ssize_t length, 666 Py_UCS4 fill_char 667 ); 668#endif 669 670/* Create a Unicode Object from the Py_UNICODE buffer u of the given 671 size. 672 673 u may be NULL which causes the contents to be undefined. It is the 674 user's responsibility to fill in the needed data afterwards. Note 675 that modifying the Unicode object contents after construction is 676 only allowed if u was set to NULL. 677 678 The buffer is copied into the new object. */ 679 680#ifndef Py_LIMITED_API 681PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 682 const Py_UNICODE *u, /* Unicode buffer */ 683 Py_ssize_t size /* size of buffer */ 684 ); 685#endif 686 687/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 688PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 689 const char *u, /* UTF-8 encoded string */ 690 Py_ssize_t size /* size of buffer */ 691 ); 692 693/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 694 UTF-8 encoded bytes. The size is determined with strlen(). */ 695PyAPI_FUNC(PyObject*) PyUnicode_FromString( 696 const char *u /* UTF-8 encoded string */ 697 ); 698 699/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. 700 Scan the string to find the maximum character. */ 701#ifndef Py_LIMITED_API 702PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( 703 int kind, 704 const void *buffer, 705 Py_ssize_t size); 706#endif 707 708PyAPI_FUNC(PyObject*) PyUnicode_Substring( 709 PyObject *str, 710 Py_ssize_t start, 711 Py_ssize_t end); 712 713#ifndef Py_LIMITED_API 714/* Compute the maximum character of the substring unicode[start:end]. 715 Return 127 for an empty string. 
*/ 716PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( 717 PyObject *unicode, 718 Py_ssize_t start, 719 Py_ssize_t end); 720#endif 721 722/* Copy the string into a UCS4 buffer including the null character if copy_null 723 is set. Return NULL and raise an exception on error. Raise a ValueError if 724 the buffer is smaller than the string. Return buffer on success. 725 726 buflen is the length of the buffer in (Py_UCS4) characters. */ 727PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 728 PyObject *unicode, 729 Py_UCS4* buffer, 730 Py_ssize_t buflen, 731 int copy_null); 732 733/* Copy the string into a UCS4 buffer. A new buffer is allocated using 734 * PyMem_Malloc; if this fails, NULL is returned with a memory error 735 exception set. */ 736PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 737 738/* Return a read-only pointer to the Unicode object's internal 739 Py_UNICODE buffer. 740 If the wchar_t/Py_UNICODE representation is not yet available, this 741 function will calculate it. */ 742 743#ifndef Py_LIMITED_API 744PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 745 PyObject *unicode /* Unicode object */ 746 ); 747#endif 748 749/* Return a read-only pointer to the Unicode object's internal 750 Py_UNICODE buffer and save the length at size. 751 If the wchar_t/Py_UNICODE representation is not yet available, this 752 function will calculate it. */ 753 754#ifndef Py_LIMITED_API 755PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( 756 PyObject *unicode, /* Unicode object */ 757 Py_ssize_t *size /* location where to save the length */ 758 ); 759#endif 760 761/* Get the length of the Unicode object. */ 762 763PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 764 PyObject *unicode 765); 766 767/* Get the number of Py_UNICODE units in the 768 string representation. */ 769 770PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 771 PyObject *unicode /* Unicode object */ 772 ); 773 774/* Read a character from the string. 
*/ 775 776PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 777 PyObject *unicode, 778 Py_ssize_t index 779 ); 780 781/* Write a character to the string. The string must have been created through 782 PyUnicode_New, must not be shared, and must not have been hashed yet. 783 784 Return 0 on success, -1 on error. */ 785 786PyAPI_FUNC(int) PyUnicode_WriteChar( 787 PyObject *unicode, 788 Py_ssize_t index, 789 Py_UCS4 character 790 ); 791 792#ifndef Py_LIMITED_API 793/* Get the maximum ordinal for a Unicode character. */ 794PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 795#endif 796 797/* Resize an Unicode object. The length is the number of characters, except 798 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length 799 is the number of Py_UNICODE characters. 800 801 *unicode is modified to point to the new (resized) object and 0 802 returned on success. 803 804 Try to resize the string in place (which is usually faster than allocating 805 a new string and copy characters), or create a new string. 806 807 Error handling is implemented as follows: an exception is set, -1 808 is returned and *unicode left untouched. 809 810 WARNING: The function doesn't check string content, the result may not be a 811 string in canonical representation. */ 812 813PyAPI_FUNC(int) PyUnicode_Resize( 814 PyObject **unicode, /* Pointer to the Unicode object */ 815 Py_ssize_t length /* New length */ 816 ); 817 818/* Coerce obj to an Unicode object and return a reference with 819 *incremented* refcount. 820 821 Coercion is done in the following way: 822 823 1. bytes, bytearray and other char buffer compatible objects are decoded 824 under the assumptions that they contain data using the UTF-8 825 encoding. Decoding is done in "strict" mode. 826 827 2. All other objects (including Unicode objects) raise an 828 exception. 829 830 The API returns NULL in case of an error. The caller is responsible 831 for decref'ing the returned objects. 
832 833*/ 834 835PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 836 register PyObject *obj, /* Object */ 837 const char *encoding, /* encoding */ 838 const char *errors /* error handling */ 839 ); 840 841/* Coerce obj to an Unicode object and return a reference with 842 *incremented* refcount. 843 844 Unicode objects are passed back as-is (subclasses are converted to 845 true Unicode objects), all other objects are delegated to 846 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 847 using UTF-8 encoding as basis for decoding the object. 848 849 The API returns NULL in case of an error. The caller is responsible 850 for decref'ing the returned objects. 851 852*/ 853 854PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 855 register PyObject *obj /* Object */ 856 ); 857 858PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 859 const char *format, /* ASCII-encoded string */ 860 va_list vargs 861 ); 862PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 863 const char *format, /* ASCII-encoded string */ 864 ... 865 ); 866 867#ifndef Py_LIMITED_API 868/* Format the object based on the format_spec, as defined in PEP 3101 869 (Advanced String Formatting). */ 870PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj, 871 PyObject *format_spec, 872 Py_ssize_t start, 873 Py_ssize_t end); 874#endif 875 876PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 877PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 878PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 879 const char *u /* UTF-8 encoded string */ 880 ); 881#ifndef Py_LIMITED_API 882PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 883#endif 884 885/* Use only if you know it's a string */ 886#define PyUnicode_CHECK_INTERNED(op) \ 887 (((PyASCIIObject *)(op))->state.interned) 888 889/* --- wchar_t support for platforms which support it --------------------- */ 890 891#ifdef HAVE_WCHAR_H 892 893/* Create a Unicode Object from the wchar_t buffer w of the given 894 size. 
895 896 The buffer is copied into the new object. */ 897 898PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 899 register const wchar_t *w, /* wchar_t buffer */ 900 Py_ssize_t size /* size of buffer */ 901 ); 902 903/* Copies the Unicode Object contents into the wchar_t buffer w. At 904 most size wchar_t characters are copied. 905 906 Note that the resulting wchar_t string may or may not be 907 0-terminated. It is the responsibility of the caller to make sure 908 that the wchar_t string is 0-terminated in case this is required by 909 the application. 910 911 Returns the number of wchar_t characters copied (excluding a 912 possibly trailing 0-termination character) or -1 in case of an 913 error. */ 914 915PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 916 PyObject *unicode, /* Unicode object */ 917 register wchar_t *w, /* wchar_t buffer */ 918 Py_ssize_t size /* size of buffer */ 919 ); 920 921/* Convert the Unicode object to a wide character string. The output string 922 always ends with a nul character. If size is not NULL, write the number of 923 wide characters (excluding the null character) into *size. 924 925 Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it) 926 on success. On error, returns NULL, *size is undefined and raises a 927 MemoryError. */ 928 929PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 930 PyObject *unicode, /* Unicode object */ 931 Py_ssize_t *size /* number of characters of the result */ 932 ); 933 934#ifndef Py_LIMITED_API 935PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); 936#endif 937 938#endif 939 940/* --- Unicode ordinals --------------------------------------------------- */ 941 942/* Create a Unicode Object from the given Unicode code point ordinal. 943 944 The ordinal must be in range(0x10000) on narrow Python builds 945 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 946 raised in case it is not. 
947 948*/ 949 950PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 951 952/* --- Free-list management ----------------------------------------------- */ 953 954/* Clear the free list used by the Unicode implementation. 955 956 This can be used to release memory used for objects on the free 957 list back to the Python memory allocator. 958 959*/ 960 961PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 962 963/* === Builtin Codecs ===================================================== 964 965 Many of these APIs take two arguments encoding and errors. These 966 parameters encoding and errors have the same semantics as the ones 967 of the builtin str() API. 968 969 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 970 971 Error handling is set by errors which may also be set to NULL 972 meaning to use the default handling defined for the codec. Default 973 error handling for all builtin codecs is "strict" (ValueErrors are 974 raised). 975 976 The codecs all use a similar interface. Only deviation from the 977 generic ones are documented. 978 979*/ 980 981/* --- Manage the default encoding ---------------------------------------- */ 982 983/* Returns a pointer to the default encoding (UTF-8) of the 984 Unicode object unicode and the size of the encoded representation 985 in bytes stored in *size. 986 987 In case of an error, no *size is set. 988 989 This function caches the UTF-8 encoded string in the unicodeobject 990 and subsequent calls will return the same string. The memory is released 991 when the unicodeobject is deallocated. 992 993 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to 994 support the previous internal function with the same behaviour. 995 996 *** This API is for interpreter INTERNAL USE ONLY and will likely 997 *** be removed or changed in the future. 998 999 *** If you need to access the Unicode object as UTF-8 bytes string, 1000 *** please use PyUnicode_AsUTF8String() instead. 
1001*/ 1002 1003#ifndef Py_LIMITED_API 1004PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize( 1005 PyObject *unicode, 1006 Py_ssize_t *size); 1007#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize 1008#endif 1009 1010/* Returns a pointer to the default encoding (UTF-8) of the 1011 Unicode object unicode. 1012 1013 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation 1014 in the unicodeobject. 1015 1016 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to 1017 support the previous internal function with the same behaviour. 1018 1019 Use of this API is DEPRECATED since no size information can be 1020 extracted from the returned data. 1021 1022 *** This API is for interpreter INTERNAL USE ONLY and will likely 1023 *** be removed or changed for Python 3.1. 1024 1025 *** If you need to access the Unicode object as UTF-8 bytes string, 1026 *** please use PyUnicode_AsUTF8String() instead. 1027 1028*/ 1029 1030#ifndef Py_LIMITED_API 1031PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode); 1032#define _PyUnicode_AsString PyUnicode_AsUTF8 1033#endif 1034 1035/* Returns "utf-8". */ 1036 1037PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 1038 1039/* --- Generic Codecs ----------------------------------------------------- */ 1040 1041/* Create a Unicode object by decoding the encoded string s of the 1042 given size. */ 1043 1044PyAPI_FUNC(PyObject*) PyUnicode_Decode( 1045 const char *s, /* encoded string */ 1046 Py_ssize_t size, /* size of buffer */ 1047 const char *encoding, /* encoding */ 1048 const char *errors /* error handling */ 1049 ); 1050 1051/* Decode a Unicode object unicode and return the result as Python 1052 object. */ 1053 1054PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 1055 PyObject *unicode, /* Unicode object */ 1056 const char *encoding, /* encoding */ 1057 const char *errors /* error handling */ 1058 ); 1059 1060/* Decode a Unicode object unicode and return the result as Unicode 1061 object. 
*/ 1062 1063PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 1064 PyObject *unicode, /* Unicode object */ 1065 const char *encoding, /* encoding */ 1066 const char *errors /* error handling */ 1067 ); 1068 1069/* Encodes a Py_UNICODE buffer of the given size and returns a 1070 Python string object. */ 1071 1072#ifndef Py_LIMITED_API 1073PyAPI_FUNC(PyObject*) PyUnicode_Encode( 1074 const Py_UNICODE *s, /* Unicode char buffer */ 1075 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 1076 const char *encoding, /* encoding */ 1077 const char *errors /* error handling */ 1078 ); 1079#endif 1080 1081/* Encodes a Unicode object and returns the result as Python 1082 object. */ 1083 1084PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 1085 PyObject *unicode, /* Unicode object */ 1086 const char *encoding, /* encoding */ 1087 const char *errors /* error handling */ 1088 ); 1089 1090/* Encodes a Unicode object and returns the result as Python string 1091 object. */ 1092 1093PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 1094 PyObject *unicode, /* Unicode object */ 1095 const char *encoding, /* encoding */ 1096 const char *errors /* error handling */ 1097 ); 1098 1099/* Encodes a Unicode object and returns the result as Unicode 1100 object. */ 1101 1102PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 1103 PyObject *unicode, /* Unicode object */ 1104 const char *encoding, /* encoding */ 1105 const char *errors /* error handling */ 1106 ); 1107 1108/* Build an encoding map. 
*/ 1109 1110PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 1111 PyObject* string /* 256 character map */ 1112 ); 1113 1114/* --- UTF-7 Codecs ------------------------------------------------------- */ 1115 1116PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 1117 const char *string, /* UTF-7 encoded string */ 1118 Py_ssize_t length, /* size of string */ 1119 const char *errors /* error handling */ 1120 ); 1121 1122PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 1123 const char *string, /* UTF-7 encoded string */ 1124 Py_ssize_t length, /* size of string */ 1125 const char *errors, /* error handling */ 1126 Py_ssize_t *consumed /* bytes consumed */ 1127 ); 1128 1129#ifndef Py_LIMITED_API 1130PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 1131 const Py_UNICODE *data, /* Unicode char buffer */ 1132 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1133 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1134 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1135 const char *errors /* error handling */ 1136 ); 1137PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7( 1138 PyObject *unicode, /* Unicode object */ 1139 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1140 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1141 const char *errors /* error handling */ 1142 ); 1143#endif 1144 1145/* --- UTF-8 Codecs ------------------------------------------------------- */ 1146 1147PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 1148 const char *string, /* UTF-8 encoded string */ 1149 Py_ssize_t length, /* size of string */ 1150 const char *errors /* error handling */ 1151 ); 1152 1153PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 1154 const char *string, /* UTF-8 encoded string */ 1155 Py_ssize_t length, /* size of string */ 1156 const char *errors, /* error handling */ 1157 Py_ssize_t *consumed /* bytes consumed */ 1158 ); 1159 1160PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 1161 PyObject 
*unicode /* Unicode object */ 1162 ); 1163 1164#ifndef Py_LIMITED_API 1165PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( 1166 PyObject *unicode, 1167 const char *errors); 1168 1169PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 1170 const Py_UNICODE *data, /* Unicode char buffer */ 1171 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1172 const char *errors /* error handling */ 1173 ); 1174#endif 1175 1176/* --- UTF-32 Codecs ------------------------------------------------------ */ 1177 1178/* Decodes length bytes from a UTF-32 encoded buffer string and returns 1179 the corresponding Unicode object. 1180 1181 errors (if non-NULL) defines the error handling. It defaults 1182 to "strict". 1183 1184 If byteorder is non-NULL, the decoder starts decoding using the 1185 given byte order: 1186 1187 *byteorder == -1: little endian 1188 *byteorder == 0: native order 1189 *byteorder == 1: big endian 1190 1191 In native mode, the first four bytes of the stream are checked for a 1192 BOM mark. If found, the BOM mark is analysed, the byte order 1193 adjusted and the BOM skipped. In the other modes, no BOM mark 1194 interpretation is done. After completion, *byteorder is set to the 1195 current byte order at the end of input data. 1196 1197 If byteorder is NULL, the codec starts in native order mode. 
1198 1199*/ 1200 1201PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 1202 const char *string, /* UTF-32 encoded string */ 1203 Py_ssize_t length, /* size of string */ 1204 const char *errors, /* error handling */ 1205 int *byteorder /* pointer to byteorder to use 1206 0=native;-1=LE,1=BE; updated on 1207 exit */ 1208 ); 1209 1210PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 1211 const char *string, /* UTF-32 encoded string */ 1212 Py_ssize_t length, /* size of string */ 1213 const char *errors, /* error handling */ 1214 int *byteorder, /* pointer to byteorder to use 1215 0=native;-1=LE,1=BE; updated on 1216 exit */ 1217 Py_ssize_t *consumed /* bytes consumed */ 1218 ); 1219 1220/* Returns a Python string using the UTF-32 encoding in native byte 1221 order. The string always starts with a BOM mark. */ 1222 1223PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 1224 PyObject *unicode /* Unicode object */ 1225 ); 1226 1227/* Returns a Python string object holding the UTF-32 encoded value of 1228 the Unicode data. 1229 1230 If byteorder is not 0, output is written according to the following 1231 byte order: 1232 1233 byteorder == -1: little endian 1234 byteorder == 0: native byte order (writes a BOM mark) 1235 byteorder == 1: big endian 1236 1237 If byteorder is 0, the output string will always start with the 1238 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1239 prepended. 
1240 1241*/ 1242 1243#ifndef Py_LIMITED_API 1244PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 1245 const Py_UNICODE *data, /* Unicode char buffer */ 1246 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1247 const char *errors, /* error handling */ 1248 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1249 ); 1250PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32( 1251 PyObject *object, /* Unicode object */ 1252 const char *errors, /* error handling */ 1253 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1254 ); 1255#endif 1256 1257/* --- UTF-16 Codecs ------------------------------------------------------ */ 1258 1259/* Decodes length bytes from a UTF-16 encoded buffer string and returns 1260 the corresponding Unicode object. 1261 1262 errors (if non-NULL) defines the error handling. It defaults 1263 to "strict". 1264 1265 If byteorder is non-NULL, the decoder starts decoding using the 1266 given byte order: 1267 1268 *byteorder == -1: little endian 1269 *byteorder == 0: native order 1270 *byteorder == 1: big endian 1271 1272 In native mode, the first two bytes of the stream are checked for a 1273 BOM mark. If found, the BOM mark is analysed, the byte order 1274 adjusted and the BOM skipped. In the other modes, no BOM mark 1275 interpretation is done. After completion, *byteorder is set to the 1276 current byte order at the end of input data. 1277 1278 If byteorder is NULL, the codec starts in native order mode. 
1279 1280*/ 1281 1282PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 1283 const char *string, /* UTF-16 encoded string */ 1284 Py_ssize_t length, /* size of string */ 1285 const char *errors, /* error handling */ 1286 int *byteorder /* pointer to byteorder to use 1287 0=native;-1=LE,1=BE; updated on 1288 exit */ 1289 ); 1290 1291PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 1292 const char *string, /* UTF-16 encoded string */ 1293 Py_ssize_t length, /* size of string */ 1294 const char *errors, /* error handling */ 1295 int *byteorder, /* pointer to byteorder to use 1296 0=native;-1=LE,1=BE; updated on 1297 exit */ 1298 Py_ssize_t *consumed /* bytes consumed */ 1299 ); 1300 1301/* Returns a Python string using the UTF-16 encoding in native byte 1302 order. The string always starts with a BOM mark. */ 1303 1304PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 1305 PyObject *unicode /* Unicode object */ 1306 ); 1307 1308/* Returns a Python string object holding the UTF-16 encoded value of 1309 the Unicode data. 1310 1311 If byteorder is not 0, output is written according to the following 1312 byte order: 1313 1314 byteorder == -1: little endian 1315 byteorder == 0: native byte order (writes a BOM mark) 1316 byteorder == 1: big endian 1317 1318 If byteorder is 0, the output string will always start with the 1319 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1320 prepended. 1321 1322 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 1323 UCS-2. This trick makes it possible to add full UTF-16 capabilities 1324 at a later point without compromising the APIs. 
1325 1326*/ 1327 1328#ifndef Py_LIMITED_API 1329PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 1330 const Py_UNICODE *data, /* Unicode char buffer */ 1331 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1332 const char *errors, /* error handling */ 1333 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1334 ); 1335PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16( 1336 PyObject* unicode, /* Unicode object */ 1337 const char *errors, /* error handling */ 1338 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1339 ); 1340#endif 1341 1342/* --- Unicode-Escape Codecs ---------------------------------------------- */ 1343 1344PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 1345 const char *string, /* Unicode-Escape encoded string */ 1346 Py_ssize_t length, /* size of string */ 1347 const char *errors /* error handling */ 1348 ); 1349 1350PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 1351 PyObject *unicode /* Unicode object */ 1352 ); 1353 1354#ifndef Py_LIMITED_API 1355PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 1356 const Py_UNICODE *data, /* Unicode char buffer */ 1357 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1358 ); 1359#endif 1360 1361/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 1362 1363PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 1364 const char *string, /* Raw-Unicode-Escape encoded string */ 1365 Py_ssize_t length, /* size of string */ 1366 const char *errors /* error handling */ 1367 ); 1368 1369PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 1370 PyObject *unicode /* Unicode object */ 1371 ); 1372 1373#ifndef Py_LIMITED_API 1374PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 1375 const Py_UNICODE *data, /* Unicode char buffer */ 1376 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1377 ); 1378#endif 1379 1380/* --- Unicode Internal Codec --------------------------------------------- 1381 1382 Only for internal use 
in _codecsmodule.c */ 1383 1384#ifndef Py_LIMITED_API 1385PyObject *_PyUnicode_DecodeUnicodeInternal( 1386 const char *string, 1387 Py_ssize_t length, 1388 const char *errors 1389 ); 1390#endif 1391 1392/* --- Latin-1 Codecs ----------------------------------------------------- 1393 1394 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1395 1396*/ 1397 1398PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1399 const char *string, /* Latin-1 encoded string */ 1400 Py_ssize_t length, /* size of string */ 1401 const char *errors /* error handling */ 1402 ); 1403 1404PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1405 PyObject *unicode /* Unicode object */ 1406 ); 1407 1408#ifndef Py_LIMITED_API 1409PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( 1410 PyObject* unicode, 1411 const char* errors); 1412 1413PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1414 const Py_UNICODE *data, /* Unicode char buffer */ 1415 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1416 const char *errors /* error handling */ 1417 ); 1418#endif 1419 1420/* --- ASCII Codecs ------------------------------------------------------- 1421 1422 Only 7-bit ASCII data is accepted. All other codes generate errors.
1423 1424*/ 1425 1426PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1427 const char *string, /* ASCII encoded string */ 1428 Py_ssize_t length, /* size of string */ 1429 const char *errors /* error handling */ 1430 ); 1431 1432PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1433 PyObject *unicode /* Unicode object */ 1434 ); 1435 1436#ifndef Py_LIMITED_API 1437PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( 1438 PyObject* unicode, 1439 const char* errors); 1440 1441PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1442 const Py_UNICODE *data, /* Unicode char buffer */ 1443 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1444 const char *errors /* error handling */ 1445 ); 1446#endif 1447 1448/* --- Character Map Codecs ----------------------------------------------- 1449 1450 This codec uses mappings to encode and decode characters. 1451 1452 Decoding mappings must map single string characters to single 1453 Unicode characters, integers (which are then interpreted as Unicode 1454 ordinals) or None (meaning "undefined mapping" and causing an 1455 error). 1456 1457 Encoding mappings must map single Unicode characters to single 1458 string characters, integers (which are then interpreted as Latin-1 1459 ordinals) or None (meaning "undefined mapping" and causing an 1460 error). 1461 1462 If a character lookup fails with a LookupError, the character is 1463 copied as-is meaning that its ordinal value will be interpreted as 1464 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1465 to contain those mappings which map characters to different code 1466 points. 
1467 1468*/ 1469 1470PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1471 const char *string, /* Encoded string */ 1472 Py_ssize_t length, /* size of string */ 1473 PyObject *mapping, /* character mapping 1474 (char ordinal -> unicode ordinal) */ 1475 const char *errors /* error handling */ 1476 ); 1477 1478PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1479 PyObject *unicode, /* Unicode object */ 1480 PyObject *mapping /* character mapping 1481 (unicode ordinal -> char ordinal) */ 1482 ); 1483 1484#ifndef Py_LIMITED_API 1485PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1486 const Py_UNICODE *data, /* Unicode char buffer */ 1487 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1488 PyObject *mapping, /* character mapping 1489 (unicode ordinal -> char ordinal) */ 1490 const char *errors /* error handling */ 1491 ); 1492PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap( 1493 PyObject *unicode, /* Unicode object */ 1494 PyObject *mapping, /* character mapping 1495 (unicode ordinal -> char ordinal) */ 1496 const char *errors /* error handling */ 1497 ); 1498#endif 1499 1500/* Translate a Py_UNICODE buffer of the given length by applying a 1501 character mapping table to it and return the resulting Unicode 1502 object. 1503 1504 The mapping table must map Unicode ordinal integers to Unicode 1505 ordinal integers or None (causing deletion of the character). 1506 1507 Mapping tables may be dictionaries or sequences. Unmapped character 1508 ordinals (ones which cause a LookupError) are left untouched and 1509 are copied as-is. 
1510 1511*/ 1512 1513#ifndef Py_LIMITED_API 1514PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1515 const Py_UNICODE *data, /* Unicode char buffer */ 1516 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1517 PyObject *table, /* Translate table */ 1518 const char *errors /* error handling */ 1519 ); 1520#endif 1521 1522#ifdef HAVE_MBCS 1523 1524/* --- MBCS codecs for Windows -------------------------------------------- */ 1525 1526PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1527 const char *string, /* MBCS encoded string */ 1528 Py_ssize_t length, /* size of string */ 1529 const char *errors /* error handling */ 1530 ); 1531 1532PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1533 const char *string, /* MBCS encoded string */ 1534 Py_ssize_t length, /* size of string */ 1535 const char *errors, /* error handling */ 1536 Py_ssize_t *consumed /* bytes consumed */ 1537 ); 1538 1539PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 1540 int code_page, /* code page number */ 1541 const char *string, /* encoded string */ 1542 Py_ssize_t length, /* size of string */ 1543 const char *errors, /* error handling */ 1544 Py_ssize_t *consumed /* bytes consumed */ 1545 ); 1546 1547PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1548 PyObject *unicode /* Unicode object */ 1549 ); 1550 1551#ifndef Py_LIMITED_API 1552PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1553 const Py_UNICODE *data, /* Unicode char buffer */ 1554 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1555 const char *errors /* error handling */ 1556 ); 1557#endif 1558 1559PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 1560 int code_page, /* code page number */ 1561 PyObject *unicode, /* Unicode object */ 1562 const char *errors /* error handling */ 1563 ); 1564 1565#endif /* HAVE_MBCS */ 1566 1567/* --- Decimal Encoder ---------------------------------------------------- */ 1568 1569/* Takes a Unicode string holding a decimal value and writes it into 1570 an output buffer using 
standard ASCII digit codes. 1571 1572 The output buffer has to provide at least length+1 bytes of storage 1573 area. The output string is 0-terminated. 1574 1575 The encoder converts whitespace to ' ', decimal characters to their 1576 corresponding ASCII digit and all other Latin-1 characters except 1577 \0 as-is. Characters outside this range (Unicode ordinals 1-255) 1578 are treated as errors. This includes embedded NULL bytes. 1579 1580 Error handling is defined by the errors argument: 1581 1582 NULL or "strict": raise a ValueError 1583 "ignore": ignore the wrong characters (these are not copied to the 1584 output buffer) 1585 "replace": replaces illegal characters with '?' 1586 1587 Returns 0 on success, -1 on failure. 1588 1589*/ 1590 1591#ifndef Py_LIMITED_API 1592PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1593 Py_UNICODE *s, /* Unicode buffer */ 1594 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1595 char *output, /* Output buffer; must have size >= length+1 */ 1596 const char *errors /* error handling */ 1597 ); 1598#endif 1599 1600/* Transforms code points that have decimal digit property to the 1601 corresponding ASCII digit code points. 1602 1603 Returns a new Unicode string on success, NULL on failure. 1604*/ 1605 1606#ifndef Py_LIMITED_API 1607PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( 1608 Py_UNICODE *s, /* Unicode buffer */ 1609 Py_ssize_t length /* Number of Py_UNICODE chars to transform */ 1610 ); 1611#endif 1612 1613/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject 1614 as argument instead of a raw buffer and length. This function additionally 1615 transforms spaces to ASCII because this is what the callers in longobject, 1616 floatobject, and complexobject did anyways.
*/ 1617 1618#ifndef Py_LIMITED_API 1619PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( 1620 PyObject *unicode /* Unicode object */ 1621 ); 1622#endif 1623 1624/* --- Locale encoding --------------------------------------------------- */ 1625 1626/* Decode a string from the current locale encoding. The decoder is strict if 1627 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 1628 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 1629 be decoded as a surrogate character and *surrogateescape* is not equal to 1630 zero, the byte sequence is escaped using the 'surrogateescape' error handler 1631 instead of being decoded. *str* must end with a null character but cannot 1632 contain embedded null characters. */ 1633 1634PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 1635 const char *str, 1636 Py_ssize_t len, 1637 const char *errors); 1638 1639/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 1640 length using strlen(). */ 1641 1642PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 1643 const char *str, 1644 const char *errors); 1645 1646/* Encode a Unicode object to the current locale encoding. The encoder is 1647 strict if *surrogateescape* is equal to zero, otherwise the 1648 "surrogateescape" error handler is used. Return a bytes object. The string 1649 cannot contain embedded null characters. */ 1650 1651PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 1652 PyObject *unicode, 1653 const char *errors 1654 ); 1655 1656/* --- File system encoding ---------------------------------------------- */ 1657 1658/* ParseTuple converter: encode str objects to bytes using 1659 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 1660 1661PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1662 1663/* ParseTuple converter: decode bytes objects to unicode using 1664 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is.
*/ 1665 1666PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1667 1668/* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1669 and the "surrogateescape" error handler. 1670 1671 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1672 encoding. 1673 1674 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1675*/ 1676 1677PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1678 const char *s /* encoded string */ 1679 ); 1680 1681/* Decode a string using Py_FileSystemDefaultEncoding 1682 and the "surrogateescape" error handler. 1683 1684 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1685 encoding. 1686*/ 1687 1688PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1689 const char *s, /* encoded string */ 1690 Py_ssize_t size /* size */ 1691 ); 1692 1693/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1694 "surrogateescape" error handler, and return bytes. 1695 1696 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1697 encoding. 1698*/ 1699 1700PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1701 PyObject *unicode 1702 ); 1703 1704/* --- Methods & Slots ---------------------------------------------------- 1705 1706 These are capable of handling Unicode objects and strings on input 1707 (we refer to them as strings in the descriptions) and return 1708 Unicode objects or integers as appropriate. */ 1709 1710/* Concat two strings giving a new Unicode string. 
*/ 1711 1712PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1713 PyObject *left, /* Left string */ 1714 PyObject *right /* Right string */ 1715 ); 1716 1717/* Concat two strings and put the result in *pleft 1718 (sets *pleft to NULL on error) */ 1719 1720PyAPI_FUNC(void) PyUnicode_Append( 1721 PyObject **pleft, /* Pointer to left string */ 1722 PyObject *right /* Right string */ 1723 ); 1724 1725/* Concat two strings, put the result in *pleft and drop the right object 1726 (sets *pleft to NULL on error) */ 1727 1728PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1729 PyObject **pleft, /* Pointer to left string */ 1730 PyObject *right /* Right string */ 1731 ); 1732 1733/* Split a string giving a list of Unicode strings. 1734 1735 If sep is NULL, splitting will be done at all whitespace 1736 substrings. Otherwise, splits occur at the given separator. 1737 1738 At most maxsplit splits will be done. If negative, no limit is set. 1739 1740 Separators are not included in the resulting list. 1741 1742*/ 1743 1744PyAPI_FUNC(PyObject*) PyUnicode_Split( 1745 PyObject *s, /* String to split */ 1746 PyObject *sep, /* String separator */ 1747 Py_ssize_t maxsplit /* Maxsplit count */ 1748 ); 1749 1750/* Ditto, but split at line breaks. 1751 1752 CRLF is considered to be one line break. Line breaks are not 1753 included in the resulting list unless keepends is true. */ 1754 1755PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1756 PyObject *s, /* String to split */ 1757 int keepends /* If true, line end markers are included */ 1758 ); 1759 1760/* Partition a string using a given separator. */ 1761 1762PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1763 PyObject *s, /* String to partition */ 1764 PyObject *sep /* String separator */ 1765 ); 1766 1767/* Partition a string using a given separator, searching from the end of the 1768 string.
*/ 1769 1770PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1771 PyObject *s, /* String to partition */ 1772 PyObject *sep /* String separator */ 1773 ); 1774 1775/* Split a string giving a list of Unicode strings. 1776 1777 If sep is NULL, splitting will be done at all whitespace 1778 substrings. Otherwise, splits occur at the given separator. 1779 1780 At most maxsplit splits will be done. But unlike PyUnicode_Split 1781 PyUnicode_RSplit splits from the end of the string. If negative, 1782 no limit is set. 1783 1784 Separators are not included in the resulting list. 1785 1786*/ 1787 1788PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1789 PyObject *s, /* String to split */ 1790 PyObject *sep, /* String separator */ 1791 Py_ssize_t maxsplit /* Maxsplit count */ 1792 ); 1793 1794/* Translate a string by applying a character mapping table to it and 1795 return the resulting Unicode object. 1796 1797 The mapping table must map Unicode ordinal integers to Unicode 1798 ordinal integers or None (causing deletion of the character). 1799 1800 Mapping tables may be dictionaries or sequences. Unmapped character 1801 ordinals (ones which cause a LookupError) are left untouched and 1802 are copied as-is. 1803 1804*/ 1805 1806PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1807 PyObject *str, /* String */ 1808 PyObject *table, /* Translate table */ 1809 const char *errors /* error handling */ 1810 ); 1811 1812/* Join a sequence of strings using the given separator and return 1813 the resulting Unicode string. */ 1814 1815PyAPI_FUNC(PyObject*) PyUnicode_Join( 1816 PyObject *separator, /* Separator string */ 1817 PyObject *seq /* Sequence object */ 1818 ); 1819 1820/* Return 1 if substr matches str[start:end] at the given tail end, 0 1821 otherwise. 
*/ 1822 1823PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1824 PyObject *str, /* String */ 1825 PyObject *substr, /* Prefix or Suffix string */ 1826 Py_ssize_t start, /* Start index */ 1827 Py_ssize_t end, /* Stop index */ 1828 int direction /* Tail end: -1 prefix, +1 suffix */ 1829 ); 1830 1831/* Return the first position of substr in str[start:end] using the 1832 given search direction or -1 if not found. -2 is returned in case 1833 an error occurred and an exception is set. */ 1834 1835PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1836 PyObject *str, /* String */ 1837 PyObject *substr, /* Substring to find */ 1838 Py_ssize_t start, /* Start index */ 1839 Py_ssize_t end, /* Stop index */ 1840 int direction /* Find direction: +1 forward, -1 backward */ 1841 ); 1842 1843/* Like PyUnicode_Find, but search for single character only. */ 1844PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 1845 PyObject *str, 1846 Py_UCS4 ch, 1847 Py_ssize_t start, 1848 Py_ssize_t end, 1849 int direction 1850 ); 1851 1852/* Count the number of occurrences of substr in str[start:end]. */ 1853 1854PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1855 PyObject *str, /* String */ 1856 PyObject *substr, /* Substring to count */ 1857 Py_ssize_t start, /* Start index */ 1858 Py_ssize_t end /* Stop index */ 1859 ); 1860 1861/* Replace at most maxcount occurrences of substr in str with replstr 1862 and return the resulting Unicode object. */ 1863 1864PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1865 PyObject *str, /* String */ 1866 PyObject *substr, /* Substring to find */ 1867 PyObject *replstr, /* Substring to replace */ 1868 Py_ssize_t maxcount /* Max. number of replacements to apply; 1869 -1 = all */ 1870 ); 1871 1872/* Compare two strings and return -1, 0, 1 for less than, equal, 1873 greater than resp. 
*/ 1874 1875PyAPI_FUNC(int) PyUnicode_Compare( 1876 PyObject *left, /* Left string */ 1877 PyObject *right /* Right string */ 1878 ); 1879 1880PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1881 PyObject *left, 1882 const char *right /* ASCII-encoded string */ 1883 ); 1884 1885/* Rich compare two strings and return one of the following: 1886 1887 - NULL in case an exception was raised 1888 - Py_True or Py_False for successfully comparisons 1889 - Py_NotImplemented in case the type combination is unknown 1890 1891 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1892 case the conversion of the arguments to Unicode fails with a 1893 UnicodeDecodeError. 1894 1895 Possible values for op: 1896 1897 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1898 1899*/ 1900 1901PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1902 PyObject *left, /* Left string */ 1903 PyObject *right, /* Right string */ 1904 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1905 ); 1906 1907/* Apply a argument tuple or dictionary to a format string and return 1908 the resulting Unicode string. */ 1909 1910PyAPI_FUNC(PyObject *) PyUnicode_Format( 1911 PyObject *format, /* Format string */ 1912 PyObject *args /* Argument tuple or dictionary */ 1913 ); 1914 1915/* Checks whether element is contained in container and return 1/0 1916 accordingly. 1917 1918 element has to coerce to an one element Unicode string. -1 is 1919 returned in case of an error. */ 1920 1921PyAPI_FUNC(int) PyUnicode_Contains( 1922 PyObject *container, /* Container string */ 1923 PyObject *element /* Element string */ 1924 ); 1925 1926/* Checks whether the string contains any NUL characters. */ 1927 1928#ifndef Py_LIMITED_API 1929PyAPI_FUNC(int) _PyUnicode_HasNULChars(PyObject *); 1930#endif 1931 1932/* Checks whether argument is a valid identifier. 
*/ 1933 1934PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1935 1936#ifndef Py_LIMITED_API 1937/* Externally visible for str.strip(unicode) */ 1938PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 1939 PyObject *self, 1940 int striptype, 1941 PyObject *sepobj 1942 ); 1943#endif 1944 1945/* Using explicit passed-in values, insert the thousands grouping 1946 into the string pointed to by buffer. For the argument descriptions, 1947 see Objects/stringlib/localeutil.h */ 1948#ifndef Py_LIMITED_API 1949PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( 1950 PyObject *unicode, 1951 Py_ssize_t index, 1952 Py_ssize_t n_buffer, 1953 void *digits, 1954 Py_ssize_t n_digits, 1955 Py_ssize_t min_width, 1956 const char *grouping, 1957 PyObject *thousands_sep, 1958 Py_UCS4 *maxchar); 1959#endif 1960/* === Characters Type APIs =============================================== */ 1961 1962/* Helper array used by Py_UNICODE_ISSPACE(). */ 1963 1964#ifndef Py_LIMITED_API 1965PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 1966 1967/* These should not be used directly. Use the Py_UNICODE_IS* and 1968 Py_UNICODE_TO* macros instead. 1969 1970 These APIs are implemented in Objects/unicodectype.c. 
1971 1972*/ 1973 1974PyAPI_FUNC(int) _PyUnicode_IsLowercase( 1975 Py_UCS4 ch /* Unicode character */ 1976 ); 1977 1978PyAPI_FUNC(int) _PyUnicode_IsUppercase( 1979 Py_UCS4 ch /* Unicode character */ 1980 ); 1981 1982PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 1983 Py_UCS4 ch /* Unicode character */ 1984 ); 1985 1986PyAPI_FUNC(int) _PyUnicode_IsXidStart( 1987 Py_UCS4 ch /* Unicode character */ 1988 ); 1989 1990PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 1991 Py_UCS4 ch /* Unicode character */ 1992 ); 1993 1994PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 1995 const Py_UCS4 ch /* Unicode character */ 1996 ); 1997 1998PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 1999 const Py_UCS4 ch /* Unicode character */ 2000 ); 2001 2002PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 2003 Py_UCS4 ch /* Unicode character */ 2004 ); 2005 2006PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 2007 Py_UCS4 ch /* Unicode character */ 2008 ); 2009 2010PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 2011 Py_UCS4 ch /* Unicode character */ 2012 ); 2013 2014PyAPI_FUNC(int) _PyUnicode_ToLowerFull( 2015 Py_UCS4 ch, /* Unicode character */ 2016 Py_UCS4 *res 2017 ); 2018 2019PyAPI_FUNC(int) _PyUnicode_ToTitleFull( 2020 Py_UCS4 ch, /* Unicode character */ 2021 Py_UCS4 *res 2022 ); 2023 2024PyAPI_FUNC(int) _PyUnicode_ToUpperFull( 2025 Py_UCS4 ch, /* Unicode character */ 2026 Py_UCS4 *res 2027 ); 2028 2029PyAPI_FUNC(int) _PyUnicode_ToFoldedFull( 2030 Py_UCS4 ch, /* Unicode character */ 2031 Py_UCS4 *res 2032 ); 2033 2034PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( 2035 Py_UCS4 ch /* Unicode character */ 2036 ); 2037 2038PyAPI_FUNC(int) _PyUnicode_IsCased( 2039 Py_UCS4 ch /* Unicode character */ 2040 ); 2041 2042PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 2043 Py_UCS4 ch /* Unicode character */ 2044 ); 2045 2046PyAPI_FUNC(int) _PyUnicode_ToDigit( 2047 Py_UCS4 ch /* Unicode character */ 2048 ); 2049 2050PyAPI_FUNC(double) _PyUnicode_ToNumeric( 2051 Py_UCS4 ch /* Unicode character */ 2052 ); 2053 2054PyAPI_FUNC(int) 
_PyUnicode_IsDecimalDigit( 2055 Py_UCS4 ch /* Unicode character */ 2056 ); 2057 2058PyAPI_FUNC(int) _PyUnicode_IsDigit( 2059 Py_UCS4 ch /* Unicode character */ 2060 ); 2061 2062PyAPI_FUNC(int) _PyUnicode_IsNumeric( 2063 Py_UCS4 ch /* Unicode character */ 2064 ); 2065 2066PyAPI_FUNC(int) _PyUnicode_IsPrintable( 2067 Py_UCS4 ch /* Unicode character */ 2068 ); 2069 2070PyAPI_FUNC(int) _PyUnicode_IsAlpha( 2071 Py_UCS4 ch /* Unicode character */ 2072 ); 2073 2074PyAPI_FUNC(size_t) Py_UNICODE_strlen( 2075 const Py_UNICODE *u 2076 ); 2077 2078PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 2079 Py_UNICODE *s1, 2080 const Py_UNICODE *s2); 2081 2082PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat( 2083 Py_UNICODE *s1, const Py_UNICODE *s2); 2084 2085PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 2086 Py_UNICODE *s1, 2087 const Py_UNICODE *s2, 2088 size_t n); 2089 2090PyAPI_FUNC(int) Py_UNICODE_strcmp( 2091 const Py_UNICODE *s1, 2092 const Py_UNICODE *s2 2093 ); 2094 2095PyAPI_FUNC(int) Py_UNICODE_strncmp( 2096 const Py_UNICODE *s1, 2097 const Py_UNICODE *s2, 2098 size_t n 2099 ); 2100 2101PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 2102 const Py_UNICODE *s, 2103 Py_UNICODE c 2104 ); 2105 2106PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 2107 const Py_UNICODE *s, 2108 Py_UNICODE c 2109 ); 2110 2111/* Create a copy of a unicode string ending with a nul character. Return NULL 2112 and raise a MemoryError exception on memory allocation failure, otherwise 2113 return a new allocated buffer (use PyMem_Free() to free the buffer). 
*/ 2114 2115PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 2116 PyObject *unicode 2117 ); 2118#endif /* Py_LIMITED_API */ 2119 2120#if defined(Py_DEBUG) && !defined(Py_LIMITED_API) 2121PyAPI_FUNC(int) _PyUnicode_CheckConsistency( 2122 PyObject *op, 2123 int check_content); 2124#endif 2125 2126/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/ 2127PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); 2128/* Clear all static strings. */ 2129PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); 2130 2131#ifdef __cplusplus 2132} 2133#endif 2134#endif /* !Py_UNICODEOBJECT_H */ 2135