unicodeobject.h revision b066cc6aba07a118c89f2a127560858051af4814
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal. (See 11http://www.egenix.com/files/python/unicode-proposal.txt). 12 13Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58#include <ctype.h> 59 60/* === Internal API ======================================================= */ 61 62/* --- Internal Unicode Format -------------------------------------------- */ 63 64/* Python 3.x requires unicode */ 65#define Py_USING_UNICODE 66 67#ifndef SIZEOF_WCHAR_T 68#error Must define SIZEOF_WCHAR_T 69#endif 70 71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Py_UNICODE was the native Unicode storage format (code unit) used by 87 Python and represents a single Unicode element in the Unicode type. 88 With PEP 393, Py_UNICODE is deprecated and replaced with a 89 typedef to wchar_t. */ 90 91#ifndef Py_LIMITED_API 92#define PY_UNICODE_TYPE wchar_t 93typedef wchar_t Py_UNICODE; 94#endif 95 96/* If the compiler provides a wchar_t type we try to support it 97 through the interface functions PyUnicode_FromWideChar(), 98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ 99 100#ifdef HAVE_USABLE_WCHAR_T 101# ifndef HAVE_WCHAR_H 102# define HAVE_WCHAR_H 103# endif 104#endif 105 106#if defined(MS_WINDOWS) 107# define HAVE_MBCS 108#endif 109 110#ifdef HAVE_WCHAR_H 111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 112# ifdef _HAVE_BSDI 113# include <time.h> 114# endif 115# include <wchar.h> 116#endif 117 118/* Py_UCS4 and Py_UCS2 are typedefs for the respective 119 unicode representations. */ 120#if SIZEOF_INT >= 4 121typedef unsigned int Py_UCS4; 122#elif SIZEOF_LONG >= 4 123typedef unsigned long Py_UCS4; 124#else 125#error "Could not find a proper typedef for Py_UCS4" 126#endif 127 128typedef unsigned short Py_UCS2; 129typedef unsigned char Py_UCS1; 130 131/* --- Internal Unicode Operations ---------------------------------------- */ 132 133/* Since splitting on whitespace is an important use case, and 134 whitespace in most situations is solely ASCII whitespace, we 135 optimize for the common case by using a quick look-up table 136 _Py_ascii_whitespace (see below) with an inlined check. 137 138 */ 139#ifndef Py_LIMITED_API 140#define Py_UNICODE_ISSPACE(ch) \ 141 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 142 143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 147 148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 151 152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 156 157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 160 161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 162 163#define Py_UNICODE_ISALNUM(ch) \ 164 (Py_UNICODE_ISALPHA(ch) || \ 165 Py_UNICODE_ISDECIMAL(ch) || \ 166 Py_UNICODE_ISDIGIT(ch) || \ 167 Py_UNICODE_ISNUMERIC(ch)) 168 169#define Py_UNICODE_COPY(target, source, length) \ 170 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 171 172#define Py_UNICODE_FILL(target, value, length) \ 173 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 174 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 175 } while (0) 176 177/* macros to work with surrogates */ 178#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDFFF) 179#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) 180#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) 181/* Join two surrogate characters and return a single Py_UCS4 value. */ 182#define Py_UNICODE_JOIN_SURROGATES(high, low) \ 183 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 184 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 185 186/* Check if substring matches at given offset. The offset must be 187 valid, and the substring must not be empty. */ 188 189#define Py_UNICODE_MATCH(string, offset, substring) \ 190 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ 191 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ 192 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) 193 194#endif /* Py_LIMITED_API */ 195 196#ifdef __cplusplus 197extern "C" { 198#endif 199 200/* --- Unicode Type ------------------------------------------------------- */ 201 202#ifndef Py_LIMITED_API 203 204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject 205 structure. state.ascii and state.compact are set, and the data 206 immediately follow the structure. utf8_length and wstr_length can be found 207 in the length field; the utf8 pointer is equal to the data pointer. */ 208typedef struct { 209 /* There are 4 forms of Unicode strings: 210 211 - compact ascii: 212 213 * structure = PyASCIIObject 214 * kind = PyUnicode_1BYTE_KIND 215 * compact = 1 216 * ascii = 1 217 * ready = 1 218 * (length is the length of the utf8 and wstr strings) 219 * (data starts just after the structure) 220 * (since ASCII is decoded from UTF-8, the utf8 string are the data) 221 222 - compact: 223 224 * structure = PyCompactUnicodeObject 225 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 226 PyUnicode_4BYTE_KIND 227 * compact = 1 228 * ready = 1 229 * ascii = 0 230 * utf8 is not shared with data 231 * utf8_length = 0 if utf8 is NULL 232 * wstr is shared with data and wstr_length=length 233 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 234 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 235 * wstr_length = 0 if wstr is NULL 236 * (data starts just after the structure) 237 238 - legacy string, not ready: 239 240 * structure = PyUnicodeObject 241 * kind = PyUnicode_WCHAR_KIND 242 * compact = 0 243 * ascii = 0 244 * ready = 0 245 * wstr is not NULL 246 * data.any is NULL 247 * utf8 is NULL 248 * utf8_length = 0 249 * interned = SSTATE_NOT_INTERNED 250 251 - legacy string, ready: 252 253 * structure = PyUnicodeObject structure 254 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 255 PyUnicode_4BYTE_KIND 256 * compact = 0 257 * ready = 1 258 * data.any is not NULL 259 * utf8 is shared and utf8_length = length with data.any if ascii = 1 260 * utf8_length = 0 if utf8 is NULL 261 * wstr is shared and wstr_length = length with data.any 262 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 263 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 264 * wstr_length = 0 if wstr is NULL 265 266 Compact strings use only one memory block (structure + characters), 267 whereas legacy strings use one block for the structure and one block 268 for characters. 269 270 Legacy strings are created by PyUnicode_FromUnicode() and 271 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready 272 when PyUnicode_READY() is called. 273 274 See also _PyUnicode_CheckConsistency(). 275 */ 276 PyObject_HEAD 277 Py_ssize_t length; /* Number of code points in the string */ 278 Py_hash_t hash; /* Hash value; -1 if not set */ 279 struct { 280 /* 281 SSTATE_NOT_INTERNED (0) 282 SSTATE_INTERNED_MORTAL (1) 283 SSTATE_INTERNED_IMMORTAL (2) 284 285 If interned != SSTATE_NOT_INTERNED, the two references from the 286 dictionary to this object are *not* counted in ob_refcnt. 287 */ 288 unsigned int interned:2; 289 /* Character size: 290 291 - PyUnicode_WCHAR_KIND (0): 292 293 * character type = wchar_t (16 or 32 bits, depending on the 294 platform) 295 296 - PyUnicode_1BYTE_KIND (1): 297 298 * character type = Py_UCS1 (8 bits, unsigned) 299 * if ascii is set, all characters must be in range 300 U+0000-U+007F, otherwise at least one character must be in range 301 U+0080-U+00FF 302 303 - PyUnicode_2BYTE_KIND (2): 304 305 * character type = Py_UCS2 (16 bits, unsigned) 306 * at least one character must be in range U+0100-U+FFFF 307 308 - PyUnicode_4BYTE_KIND (3): 309 310 * character type = Py_UCS4 (32 bits, unsigned) 311 * at least one character must be in range U+10000-U+10FFFF 312 */ 313 unsigned int kind:2; 314 /* Compact is with respect to the allocation scheme. Compact unicode 315 objects only require one memory block while non-compact objects use 316 one block for the PyUnicodeObject struct and another for its data 317 buffer. */ 318 unsigned int compact:1; 319 /* The string only contains characters in range U+0000-U+007F (ASCII) 320 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is 321 set, use the PyASCIIObject structure. */ 322 unsigned int ascii:1; 323 /* The ready flag indicates whether the object layout is initialized 324 completely. This means that this is either a compact object, or 325 the data pointer is filled out. The bit is redundant, and helps 326 to minimize the test in PyUnicode_IS_READY(). */ 327 unsigned int ready:1; 328 } state; 329 wchar_t *wstr; /* wchar_t representation (null-terminated) */ 330} PyASCIIObject; 331 332/* Non-ASCII strings allocated through PyUnicode_New use the 333 PyCompactUnicodeObject structure. state.compact is set, and the data 334 immediately follow the structure. */ 335typedef struct { 336 PyASCIIObject _base; 337 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the 338 * terminating \0. */ 339 char *utf8; /* UTF-8 representation (null-terminated) */ 340 Py_ssize_t wstr_length; /* Number of code points in wstr, possible 341 * surrogates count as two code points. */ 342} PyCompactUnicodeObject; 343 344/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the 345 PyUnicodeObject structure. The actual string data is initially in the wstr 346 block, and copied into the data block using _PyUnicode_Ready. */ 347typedef struct { 348 PyCompactUnicodeObject _base; 349 union { 350 void *any; 351 Py_UCS1 *latin1; 352 Py_UCS2 *ucs2; 353 Py_UCS4 *ucs4; 354 } data; /* Canonical, smallest-form Unicode buffer */ 355} PyUnicodeObject; 356#endif 357 358PyAPI_DATA(PyTypeObject) PyUnicode_Type; 359PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 360 361#define PyUnicode_Check(op) \ 362 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 363#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 364 365/* Fast access macros */ 366#ifndef Py_LIMITED_API 367 368#define PyUnicode_WSTR_LENGTH(op) \ 369 (PyUnicode_IS_COMPACT_ASCII(op) ? \ 370 ((PyASCIIObject*)op)->length : \ 371 ((PyCompactUnicodeObject*)op)->wstr_length) 372 373/* Returns the deprecated Py_UNICODE representation's size in code units 374 (this includes surrogate pairs as 2 units). 375 If the Py_UNICODE representation is not available, it will be computed 376 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ 377 378#define PyUnicode_GET_SIZE(op) \ 379 (assert(PyUnicode_Check(op)), \ 380 (((PyASCIIObject *)(op))->wstr) ? \ 381 PyUnicode_WSTR_LENGTH(op) : \ 382 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \ 383 PyUnicode_WSTR_LENGTH(op))) 384 385#define PyUnicode_GET_DATA_SIZE(op) \ 386 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) 387 388/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE 389 representation on demand. Using this macro is very inefficient now, 390 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or 391 use PyUnicode_WRITE() and PyUnicode_READ(). */ 392 393#define PyUnicode_AS_UNICODE(op) \ 394 (assert(PyUnicode_Check(op)), \ 395 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \ 396 PyUnicode_AsUnicode((PyObject *)(op))) 397 398#define PyUnicode_AS_DATA(op) \ 399 ((const char *)(PyUnicode_AS_UNICODE(op))) 400 401 402/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ 403 404/* Values for PyUnicodeObject.state: */ 405 406/* Interning state. */ 407#define SSTATE_NOT_INTERNED 0 408#define SSTATE_INTERNED_MORTAL 1 409#define SSTATE_INTERNED_IMMORTAL 2 410 411/* Return true if the string contains only ASCII characters, or 0 if not. The 412 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks 413 or Ready calls are performed. */ 414#define PyUnicode_IS_ASCII(op) \ 415 (((PyASCIIObject*)op)->state.ascii) 416 417/* Return true if the string is compact or 0 if not. 418 No type checks or Ready calls are performed. */ 419#define PyUnicode_IS_COMPACT(op) \ 420 (((PyASCIIObject*)(op))->state.compact) 421 422/* Return true if the string is a compact ASCII string (use PyASCIIObject 423 structure), or 0 if not. No type checks or Ready calls are performed. */ 424#define PyUnicode_IS_COMPACT_ASCII(op) \ 425 (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op)) 426 427/* String contains only wstr byte characters. This is only possible 428 when the string was created with a legacy API and _PyUnicode_Ready() 429 has not been called yet. */ 430#define PyUnicode_WCHAR_KIND 0 431 432/* Return values of the PyUnicode_KIND() macro: */ 433 434#define PyUnicode_1BYTE_KIND 1 435#define PyUnicode_2BYTE_KIND 2 436#define PyUnicode_4BYTE_KIND 3 437 438 439/* Return the number of bytes the string uses to represent single characters, 440 this can be 1, 2 or 4. 441 442 See also PyUnicode_KIND_SIZE(). */ 443#define PyUnicode_CHARACTER_SIZE(op) \ 444 (((Py_ssize_t)1 << (PyUnicode_KIND(op) - 1))) 445 446/* Return pointers to the canonical representation cast to unsigned char, 447 Py_UCS2, or Py_UCS4 for direct character access. 448 No checks are performed, use PyUnicode_CHARACTER_SIZE or 449 PyUnicode_KIND() before to ensure these will work correctly. */ 450 451#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op)) 452#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) 453#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) 454 455/* Return one of the PyUnicode_*_KIND values defined above. */ 456#define PyUnicode_KIND(op) \ 457 (assert(PyUnicode_Check(op)), \ 458 assert(PyUnicode_IS_READY(op)), \ 459 ((PyASCIIObject *)(op))->state.kind) 460 461/* Return a void pointer to the raw unicode buffer. */ 462#define _PyUnicode_COMPACT_DATA(op) \ 463 (PyUnicode_IS_COMPACT_ASCII(op) ? \ 464 ((void*)((PyASCIIObject*)(op) + 1)) : \ 465 ((void*)((PyCompactUnicodeObject*)(op) + 1))) 466 467#define _PyUnicode_NONCOMPACT_DATA(op) \ 468 (assert(((PyUnicodeObject*)(op))->data.any), \ 469 ((((PyUnicodeObject *)(op))->data.any))) 470 471#define PyUnicode_DATA(op) \ 472 (assert(PyUnicode_Check(op)), \ 473 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \ 474 _PyUnicode_NONCOMPACT_DATA(op)) 475 476/* Compute (index * char_size) where char_size is 2 ** (kind - 1). 477 The index is a character index, the result is a size in bytes. 478 479 See also PyUnicode_CHARACTER_SIZE(). */ 480#define PyUnicode_KIND_SIZE(kind, index) \ 481 (((Py_ssize_t)(index)) << ((kind) - 1)) 482 483/* In the access macros below, "kind" may be evaluated more than once. 484 All other macro parameters are evaluated exactly once, so it is safe 485 to put side effects into them (such as increasing the index). */ 486 487/* Write into the canonical representation, this macro does not do any sanity 488 checks and is intended for usage in loops. The caller should cache the 489 kind and data pointers obtained from other macro calls. 490 index is the index in the string (starts at 0) and value is the new 491 code point value which should be written to that location. */ 492#define PyUnicode_WRITE(kind, data, index, value) \ 493 do { \ 494 switch ((kind)) { \ 495 case PyUnicode_1BYTE_KIND: { \ 496 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \ 497 break; \ 498 } \ 499 case PyUnicode_2BYTE_KIND: { \ 500 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \ 501 break; \ 502 } \ 503 default: { \ 504 assert((kind) == PyUnicode_4BYTE_KIND); \ 505 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \ 506 } \ 507 } \ 508 } while (0) 509 510/* Read a code point from the string's canonical representation. No checks 511 or ready calls are performed. */ 512#define PyUnicode_READ(kind, data, index) \ 513 ((Py_UCS4) \ 514 ((kind) == PyUnicode_1BYTE_KIND ? \ 515 ((const Py_UCS1 *)(data))[(index)] : \ 516 ((kind) == PyUnicode_2BYTE_KIND ? \ 517 ((const Py_UCS2 *)(data))[(index)] : \ 518 ((const Py_UCS4 *)(data))[(index)] \ 519 ) \ 520 )) 521 522/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it 523 calls PyUnicode_KIND() and might call it twice. For single reads, use 524 PyUnicode_READ_CHAR, for multiple consecutive reads callers should 525 cache kind and use PyUnicode_READ instead. */ 526#define PyUnicode_READ_CHAR(unicode, index) \ 527 (assert(PyUnicode_Check(unicode)), \ 528 assert(PyUnicode_IS_READY(unicode)), \ 529 (Py_UCS4) \ 530 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \ 531 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \ 532 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \ 533 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \ 534 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \ 535 ) \ 536 )) 537 538/* Returns the length of the unicode string. The caller has to make sure that 539 the string has it's canonical representation set before calling 540 this macro. Call PyUnicode_(FAST_)Ready to ensure that. */ 541#define PyUnicode_GET_LENGTH(op) \ 542 (assert(PyUnicode_Check(op)), \ 543 assert(PyUnicode_IS_READY(op)), \ 544 ((PyASCIIObject *)(op))->length) 545 546 547/* Fast check to determine whether an object is ready. Equivalent to 548 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */ 549 550#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) 551 552/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best 553 case. If the canonical representation is not yet set, it will still call 554 _PyUnicode_Ready(). 555 Returns 0 on success and -1 on errors. */ 556#define PyUnicode_READY(op) \ 557 (assert(PyUnicode_Check(op)), \ 558 (PyUnicode_IS_READY(op) ? \ 559 0 : _PyUnicode_Ready((PyObject *)(op)))) 560 561/* Return a maximum character value which is suitable for creating another 562 string based on op. This is always an approximation but more efficient 563 than iterating over the string. */ 564#define PyUnicode_MAX_CHAR_VALUE(op) \ 565 (assert(PyUnicode_IS_READY(op)), \ 566 (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \ 567 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ 568 (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \ 569 (0x7fU) : (0xffU) \ 570 ) : \ 571 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \ 572 (0xffffU) : (0x10ffffU) \ 573 )))) 574 575#endif 576 577/* --- Constants ---------------------------------------------------------- */ 578 579/* This Unicode character will be used as replacement character during 580 decoding if the errors argument is set to "replace". Note: the 581 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 582 Unicode 3.0. */ 583 584#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 585 586/* === Public API ========================================================= */ 587 588/* --- Plain Py_UNICODE --------------------------------------------------- */ 589 590/* With PEP 393, this is the recommended way to allocate a new unicode object. 591 This function will allocate the object and its buffer in a single memory 592 block. Objects created using this function are not resizable. */ 593#ifndef Py_LIMITED_API 594PyAPI_FUNC(PyObject*) PyUnicode_New( 595 Py_ssize_t size, /* Number of code points in the new string */ 596 Py_UCS4 maxchar /* maximum code point value in the string */ 597 ); 598#endif 599 600/* Initializes the canonical string representation from a the deprecated 601 wstr/Py_UNICODE representation. This function is used to convert Unicode 602 objects which were created using the old API to the new flexible format 603 introduced with PEP 393. 604 605 Don't call this function directly, use the public PyUnicode_READY() macro 606 instead. */ 607#ifndef Py_LIMITED_API 608PyAPI_FUNC(int) _PyUnicode_Ready( 609 PyObject *unicode /* Unicode object */ 610 ); 611#endif 612 613/* Get a copy of a Unicode string. */ 614PyAPI_FUNC(PyObject*) PyUnicode_Copy( 615 PyObject *unicode 616 ); 617 618/* Copy character from one unicode object into another, this function performs 619 character conversion when necessary and falls back to memcpy if possible. 620 621 Fail if to is too small (smaller than how_many or smaller than 622 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > 623 kind(to), or if to has more than 1 reference. 624 625 Return the number of written character, or return -1 and raise an exception 626 on error. 627 628 Pseudo-code: 629 630 how_many = min(how_many, len(from) - from_start) 631 to[to_start:to_start+how_many] = from[from_start:from_start+how_many] 632 return how_many 633 634 Note: The function doesn't write a terminating null character. 635 */ 636#ifndef Py_LIMITED_API 637PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( 638 PyObject *to, 639 Py_ssize_t to_start, 640 PyObject *from, 641 Py_ssize_t from_start, 642 Py_ssize_t how_many 643 ); 644#endif 645 646/* Create a Unicode Object from the Py_UNICODE buffer u of the given 647 size. 648 649 u may be NULL which causes the contents to be undefined. It is the 650 user's responsibility to fill in the needed data afterwards. Note 651 that modifying the Unicode object contents after construction is 652 only allowed if u was set to NULL. 653 654 The buffer is copied into the new object. */ 655 656#ifndef Py_LIMITED_API 657PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 658 const Py_UNICODE *u, /* Unicode buffer */ 659 Py_ssize_t size /* size of buffer */ 660 ); 661#endif 662 663/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 664PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 665 const char *u, /* UTF-8 encoded string */ 666 Py_ssize_t size /* size of buffer */ 667 ); 668 669/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 670 UTF-8 encoded bytes. The size is determined with strlen(). */ 671PyAPI_FUNC(PyObject*) PyUnicode_FromString( 672 const char *u /* UTF-8 encoded string */ 673 ); 674 675/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. 676 Scan the string to find the maximum character. */ 677#ifndef Py_LIMITED_API 678PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( 679 int kind, 680 const void *buffer, 681 Py_ssize_t size); 682#endif 683 684PyAPI_FUNC(PyObject*) PyUnicode_Substring( 685 PyObject *str, 686 Py_ssize_t start, 687 Py_ssize_t end); 688 689/* Copy the string into a UCS4 buffer including the null character is copy_null 690 is set. Return NULL and raise an exception on error. Raise a ValueError if 691 the buffer is smaller than the string. Return buffer on success. 692 693 buflen is the length of the buffer in (Py_UCS4) characters. */ 694PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 695 PyObject *unicode, 696 Py_UCS4* buffer, 697 Py_ssize_t buflen, 698 int copy_null); 699 700/* Copy the string into a UCS4 buffer. A new buffer is allocated using 701 * PyMem_Malloc; if this fails, NULL is returned with a memory error 702 exception set. */ 703PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 704 705/* Return a read-only pointer to the Unicode object's internal 706 Py_UNICODE buffer. 707 If the wchar_t/Py_UNICODE representation is not yet available, this 708 function will calculate it. */ 709 710#ifndef Py_LIMITED_API 711PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 712 PyObject *unicode /* Unicode object */ 713 ); 714#endif 715 716/* Return a read-only pointer to the Unicode object's internal 717 Py_UNICODE buffer and save the length at size. 718 If the wchar_t/Py_UNICODE representation is not yet available, this 719 function will calculate it. */ 720 721#ifndef Py_LIMITED_API 722PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( 723 PyObject *unicode, /* Unicode object */ 724 Py_ssize_t *size /* location where to save the length */ 725 ); 726#endif 727 728/* Get the length of the Unicode object. */ 729 730PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 731 PyObject *unicode 732); 733 734/* Get the number of Py_UNICODE units in the 735 string representation. */ 736 737PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 738 PyObject *unicode /* Unicode object */ 739 ); 740 741/* Read a character from the string. */ 742 743PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 744 PyObject *unicode, 745 Py_ssize_t index 746 ); 747 748/* Write a character to the string. The string must have been created through 749 PyUnicode_New, must not be shared, and must not have been hashed yet. 750 751 Return 0 on success, -1 on error. */ 752 753PyAPI_FUNC(int) PyUnicode_WriteChar( 754 PyObject *unicode, 755 Py_ssize_t index, 756 Py_UCS4 character 757 ); 758 759#ifndef Py_LIMITED_API 760/* Get the maximum ordinal for a Unicode character. */ 761PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 762#endif 763 764/* Resize an Unicode object allocated by the legacy API (e.g. 765 PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g. 766 PyUnicode_New) cannot be resized by this function. 767 768 The length is a number of Py_UNICODE characters (and not the number of code 769 points). 770 771 *unicode is modified to point to the new (resized) object and 0 772 returned on success. 773 774 If the refcount on the object is 1, the function resizes the string in 775 place, which is usually faster than allocating a new string (and copy 776 characters). 777 778 Error handling is implemented as follows: an exception is set, -1 779 is returned and *unicode left untouched. */ 780 781PyAPI_FUNC(int) PyUnicode_Resize( 782 PyObject **unicode, /* Pointer to the Unicode object */ 783 Py_ssize_t length /* New length */ 784 ); 785 786/* Coerce obj to an Unicode object and return a reference with 787 *incremented* refcount. 788 789 Coercion is done in the following way: 790 791 1. bytes, bytearray and other char buffer compatible objects are decoded 792 under the assumptions that they contain data using the UTF-8 793 encoding. Decoding is done in "strict" mode. 794 795 2. All other objects (including Unicode objects) raise an 796 exception. 797 798 The API returns NULL in case of an error. The caller is responsible 799 for decref'ing the returned objects. 800 801*/ 802 803PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 804 register PyObject *obj, /* Object */ 805 const char *encoding, /* encoding */ 806 const char *errors /* error handling */ 807 ); 808 809/* Coerce obj to an Unicode object and return a reference with 810 *incremented* refcount. 811 812 Unicode objects are passed back as-is (subclasses are converted to 813 true Unicode objects), all other objects are delegated to 814 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 815 using UTF-8 encoding as basis for decoding the object. 816 817 The API returns NULL in case of an error. The caller is responsible 818 for decref'ing the returned objects. 819 820*/ 821 822PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 823 register PyObject *obj /* Object */ 824 ); 825 826PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 827 const char *format, /* ASCII-encoded string */ 828 va_list vargs 829 ); 830PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 831 const char *format, /* ASCII-encoded string */ 832 ... 833 ); 834 835#ifndef Py_LIMITED_API 836/* Format the object based on the format_spec, as defined in PEP 3101 837 (Advanced String Formatting). */ 838PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj, 839 PyObject *format_spec, 840 Py_ssize_t start, 841 Py_ssize_t end); 842#endif 843 844PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 845PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 846PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 847 const char *u /* UTF-8 encoded string */ 848 ); 849#ifndef Py_LIMITED_API 850PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 851#endif 852 853/* Use only if you know it's a string */ 854#define PyUnicode_CHECK_INTERNED(op) \ 855 (((PyASCIIObject *)(op))->state.interned) 856 857/* --- wchar_t support for platforms which support it --------------------- */ 858 859#ifdef HAVE_WCHAR_H 860 861/* Create a Unicode Object from the wchar_t buffer w of the given 862 size. 863 864 The buffer is copied into the new object. */ 865 866PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 867 register const wchar_t *w, /* wchar_t buffer */ 868 Py_ssize_t size /* size of buffer */ 869 ); 870 871/* Copies the Unicode Object contents into the wchar_t buffer w. At 872 most size wchar_t characters are copied. 873 874 Note that the resulting wchar_t string may or may not be 875 0-terminated. It is the responsibility of the caller to make sure 876 that the wchar_t string is 0-terminated in case this is required by 877 the application. 878 879 Returns the number of wchar_t characters copied (excluding a 880 possibly trailing 0-termination character) or -1 in case of an 881 error. */ 882 883PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 884 PyObject *unicode, /* Unicode object */ 885 register wchar_t *w, /* wchar_t buffer */ 886 Py_ssize_t size /* size of buffer */ 887 ); 888 889/* Convert the Unicode object to a wide character string. The output string 890 always ends with a nul character. If size is not NULL, write the number of 891 wide characters (excluding the null character) into *size. 892 893 Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it) 894 on success. On error, returns NULL, *size is undefined and raises a 895 MemoryError. */ 896 897PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 898 PyObject *unicode, /* Unicode object */ 899 Py_ssize_t *size /* number of characters of the result */ 900 ); 901 902#ifndef Py_LIMITED_API 903PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); 904#endif 905 906#endif 907 908/* --- Unicode ordinals --------------------------------------------------- */ 909 910/* Create a Unicode Object from the given Unicode code point ordinal. 911 912 The ordinal must be in range(0x10000) on narrow Python builds 913 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 914 raised in case it is not. 915 916*/ 917 918PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 919 920/* --- Free-list management ----------------------------------------------- */ 921 922/* Clear the free list used by the Unicode implementation. 923 924 This can be used to release memory used for objects on the free 925 list back to the Python memory allocator. 926 927*/ 928 929PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 930 931/* === Builtin Codecs ===================================================== 932 933 Many of these APIs take two arguments encoding and errors. These 934 parameters encoding and errors have the same semantics as the ones 935 of the builtin str() API. 936 937 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 938 939 Error handling is set by errors which may also be set to NULL 940 meaning to use the default handling defined for the codec. Default 941 error handling for all builtin codecs is "strict" (ValueErrors are 942 raised). 943 944 The codecs all use a similar interface. Only deviation from the 945 generic ones are documented. 946 947*/ 948 949/* --- Manage the default encoding ---------------------------------------- */ 950 951/* Returns a pointer to the default encoding (UTF-8) of the 952 Unicode object unicode and the size of the encoded representation 953 in bytes stored in *size. 954 955 In case of an error, no *size is set. 956 957 This function caches the UTF-8 encoded string in the unicodeobject 958 and subsequent calls will return the same string. The memory is released 959 when the unicodeobject is deallocated. 960 961 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to 962 support the previous internal function with the same behaviour. 963 964 *** This API is for interpreter INTERNAL USE ONLY and will likely 965 *** be removed or changed in the future. 966 967 *** If you need to access the Unicode object as UTF-8 bytes string, 968 *** please use PyUnicode_AsUTF8String() instead. 969*/ 970 971#ifndef Py_LIMITED_API 972PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize( 973 PyObject *unicode, 974 Py_ssize_t *size); 975#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize 976#endif 977 978/* Returns a pointer to the default encoding (UTF-8) of the 979 Unicode object unicode. 980 981 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation 982 in the unicodeobject. 983 984 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to 985 support the previous internal function with the same behaviour. 986 987 Use of this API is DEPRECATED since no size information can be 988 extracted from the returned data. 989 990 *** This API is for interpreter INTERNAL USE ONLY and will likely 991 *** be removed or changed for Python 3.1. 992 993 *** If you need to access the Unicode object as UTF-8 bytes string, 994 *** please use PyUnicode_AsUTF8String() instead. 995 996*/ 997 998#ifndef Py_LIMITED_API 999PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode); 1000#define _PyUnicode_AsString PyUnicode_AsUTF8 1001#endif 1002 1003/* Returns "utf-8". */ 1004 1005PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 1006 1007/* --- Generic Codecs ----------------------------------------------------- */ 1008 1009/* Create a Unicode object by decoding the encoded string s of the 1010 given size. */ 1011 1012PyAPI_FUNC(PyObject*) PyUnicode_Decode( 1013 const char *s, /* encoded string */ 1014 Py_ssize_t size, /* size of buffer */ 1015 const char *encoding, /* encoding */ 1016 const char *errors /* error handling */ 1017 ); 1018 1019/* Decode a Unicode object unicode and return the result as Python 1020 object. */ 1021 1022PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 1023 PyObject *unicode, /* Unicode object */ 1024 const char *encoding, /* encoding */ 1025 const char *errors /* error handling */ 1026 ); 1027 1028/* Decode a Unicode object unicode and return the result as Unicode 1029 object. */ 1030 1031PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 1032 PyObject *unicode, /* Unicode object */ 1033 const char *encoding, /* encoding */ 1034 const char *errors /* error handling */ 1035 ); 1036 1037/* Encodes a Py_UNICODE buffer of the given size and returns a 1038 Python string object. */ 1039 1040#ifndef Py_LIMITED_API 1041PyAPI_FUNC(PyObject*) PyUnicode_Encode( 1042 const Py_UNICODE *s, /* Unicode char buffer */ 1043 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 1044 const char *encoding, /* encoding */ 1045 const char *errors /* error handling */ 1046 ); 1047#endif 1048 1049/* Encodes a Unicode object and returns the result as Python 1050 object. */ 1051 1052PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 1053 PyObject *unicode, /* Unicode object */ 1054 const char *encoding, /* encoding */ 1055 const char *errors /* error handling */ 1056 ); 1057 1058/* Encodes a Unicode object and returns the result as Python string 1059 object. */ 1060 1061PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 1062 PyObject *unicode, /* Unicode object */ 1063 const char *encoding, /* encoding */ 1064 const char *errors /* error handling */ 1065 ); 1066 1067/* Encodes a Unicode object and returns the result as Unicode 1068 object. */ 1069 1070PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 1071 PyObject *unicode, /* Unicode object */ 1072 const char *encoding, /* encoding */ 1073 const char *errors /* error handling */ 1074 ); 1075 1076/* Build an encoding map. */ 1077 1078PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 1079 PyObject* string /* 256 character map */ 1080 ); 1081 1082/* --- UTF-7 Codecs ------------------------------------------------------- */ 1083 1084PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 1085 const char *string, /* UTF-7 encoded string */ 1086 Py_ssize_t length, /* size of string */ 1087 const char *errors /* error handling */ 1088 ); 1089 1090PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 1091 const char *string, /* UTF-7 encoded string */ 1092 Py_ssize_t length, /* size of string */ 1093 const char *errors, /* error handling */ 1094 Py_ssize_t *consumed /* bytes consumed */ 1095 ); 1096 1097#ifndef Py_LIMITED_API 1098PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 1099 const Py_UNICODE *data, /* Unicode char buffer */ 1100 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1101 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1102 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1103 const char *errors /* error handling */ 1104 ); 1105#endif 1106 1107/* --- UTF-8 Codecs ------------------------------------------------------- */ 1108 1109PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 1110 const char *string, /* UTF-8 encoded string */ 1111 Py_ssize_t length, /* size of string */ 1112 const char *errors /* error handling */ 1113 ); 1114 1115PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 1116 const char *string, /* UTF-8 encoded string */ 1117 Py_ssize_t length, /* size of string */ 1118 const char *errors, /* error handling */ 1119 Py_ssize_t *consumed /* bytes consumed */ 1120 ); 1121 1122PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 1123 PyObject *unicode /* Unicode object */ 1124 ); 1125 1126#ifndef Py_LIMITED_API 1127PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( 1128 PyObject *unicode, 1129 const char *errors); 1130 1131PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 1132 const Py_UNICODE *data, /* Unicode char buffer */ 1133 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1134 const char *errors /* error handling */ 1135 ); 1136#endif 1137 1138/* --- UTF-32 Codecs ------------------------------------------------------ */ 1139 1140/* Decodes length bytes from a UTF-32 encoded buffer string and returns 1141 the corresponding Unicode object. 1142 1143 errors (if non-NULL) defines the error handling. It defaults 1144 to "strict". 1145 1146 If byteorder is non-NULL, the decoder starts decoding using the 1147 given byte order: 1148 1149 *byteorder == -1: little endian 1150 *byteorder == 0: native order 1151 *byteorder == 1: big endian 1152 1153 In native mode, the first four bytes of the stream are checked for a 1154 BOM mark. If found, the BOM mark is analysed, the byte order 1155 adjusted and the BOM skipped. In the other modes, no BOM mark 1156 interpretation is done. After completion, *byteorder is set to the 1157 current byte order at the end of input data. 1158 1159 If byteorder is NULL, the codec starts in native order mode. 1160 1161*/ 1162 1163PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 1164 const char *string, /* UTF-32 encoded string */ 1165 Py_ssize_t length, /* size of string */ 1166 const char *errors, /* error handling */ 1167 int *byteorder /* pointer to byteorder to use 1168 0=native;-1=LE,1=BE; updated on 1169 exit */ 1170 ); 1171 1172PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 1173 const char *string, /* UTF-32 encoded string */ 1174 Py_ssize_t length, /* size of string */ 1175 const char *errors, /* error handling */ 1176 int *byteorder, /* pointer to byteorder to use 1177 0=native;-1=LE,1=BE; updated on 1178 exit */ 1179 Py_ssize_t *consumed /* bytes consumed */ 1180 ); 1181 1182/* Returns a Python string using the UTF-32 encoding in native byte 1183 order. The string always starts with a BOM mark. */ 1184 1185PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 1186 PyObject *unicode /* Unicode object */ 1187 ); 1188 1189/* Returns a Python string object holding the UTF-32 encoded value of 1190 the Unicode data. 1191 1192 If byteorder is not 0, output is written according to the following 1193 byte order: 1194 1195 byteorder == -1: little endian 1196 byteorder == 0: native byte order (writes a BOM mark) 1197 byteorder == 1: big endian 1198 1199 If byteorder is 0, the output string will always start with the 1200 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1201 prepended. 1202 1203*/ 1204 1205#ifndef Py_LIMITED_API 1206PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 1207 const Py_UNICODE *data, /* Unicode char buffer */ 1208 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1209 const char *errors, /* error handling */ 1210 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1211 ); 1212#endif 1213 1214/* --- UTF-16 Codecs ------------------------------------------------------ */ 1215 1216/* Decodes length bytes from a UTF-16 encoded buffer string and returns 1217 the corresponding Unicode object. 1218 1219 errors (if non-NULL) defines the error handling. It defaults 1220 to "strict". 1221 1222 If byteorder is non-NULL, the decoder starts decoding using the 1223 given byte order: 1224 1225 *byteorder == -1: little endian 1226 *byteorder == 0: native order 1227 *byteorder == 1: big endian 1228 1229 In native mode, the first two bytes of the stream are checked for a 1230 BOM mark. If found, the BOM mark is analysed, the byte order 1231 adjusted and the BOM skipped. In the other modes, no BOM mark 1232 interpretation is done. After completion, *byteorder is set to the 1233 current byte order at the end of input data. 1234 1235 If byteorder is NULL, the codec starts in native order mode. 1236 1237*/ 1238 1239PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 1240 const char *string, /* UTF-16 encoded string */ 1241 Py_ssize_t length, /* size of string */ 1242 const char *errors, /* error handling */ 1243 int *byteorder /* pointer to byteorder to use 1244 0=native;-1=LE,1=BE; updated on 1245 exit */ 1246 ); 1247 1248PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 1249 const char *string, /* UTF-16 encoded string */ 1250 Py_ssize_t length, /* size of string */ 1251 const char *errors, /* error handling */ 1252 int *byteorder, /* pointer to byteorder to use 1253 0=native;-1=LE,1=BE; updated on 1254 exit */ 1255 Py_ssize_t *consumed /* bytes consumed */ 1256 ); 1257 1258/* Returns a Python string using the UTF-16 encoding in native byte 1259 order. The string always starts with a BOM mark. */ 1260 1261PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 1262 PyObject *unicode /* Unicode object */ 1263 ); 1264 1265/* Returns a Python string object holding the UTF-16 encoded value of 1266 the Unicode data. 1267 1268 If byteorder is not 0, output is written according to the following 1269 byte order: 1270 1271 byteorder == -1: little endian 1272 byteorder == 0: native byte order (writes a BOM mark) 1273 byteorder == 1: big endian 1274 1275 If byteorder is 0, the output string will always start with the 1276 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1277 prepended. 1278 1279 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 1280 UCS-2. This trick makes it possible to add full UTF-16 capabilities 1281 at a later point without compromising the APIs. 1282 1283*/ 1284 1285#ifndef Py_LIMITED_API 1286PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 1287 const Py_UNICODE *data, /* Unicode char buffer */ 1288 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1289 const char *errors, /* error handling */ 1290 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1291 ); 1292#endif 1293 1294/* --- Unicode-Escape Codecs ---------------------------------------------- */ 1295 1296PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 1297 const char *string, /* Unicode-Escape encoded string */ 1298 Py_ssize_t length, /* size of string */ 1299 const char *errors /* error handling */ 1300 ); 1301 1302PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 1303 PyObject *unicode /* Unicode object */ 1304 ); 1305 1306#ifndef Py_LIMITED_API 1307PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 1308 const Py_UNICODE *data, /* Unicode char buffer */ 1309 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1310 ); 1311#endif 1312 1313/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 1314 1315PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 1316 const char *string, /* Raw-Unicode-Escape encoded string */ 1317 Py_ssize_t length, /* size of string */ 1318 const char *errors /* error handling */ 1319 ); 1320 1321PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 1322 PyObject *unicode /* Unicode object */ 1323 ); 1324 1325#ifndef Py_LIMITED_API 1326PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 1327 const Py_UNICODE *data, /* Unicode char buffer */ 1328 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1329 ); 1330#endif 1331 1332/* --- Unicode Internal Codec --------------------------------------------- 1333 1334 Only for internal use in _codecsmodule.c */ 1335 1336#ifndef Py_LIMITED_API 1337PyObject *_PyUnicode_DecodeUnicodeInternal( 1338 const char *string, 1339 Py_ssize_t length, 1340 const char *errors 1341 ); 1342#endif 1343 1344/* --- Latin-1 Codecs ----------------------------------------------------- 1345 1346 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1347 1348*/ 1349 1350PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1351 const char *string, /* Latin-1 encoded string */ 1352 Py_ssize_t length, /* size of string */ 1353 const char *errors /* error handling */ 1354 ); 1355 1356PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1357 PyObject *unicode /* Unicode object */ 1358 ); 1359 1360#ifndef Py_LIMITED_API 1361PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( 1362 PyObject* unicode, 1363 const char* errors); 1364 1365PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1366 const Py_UNICODE *data, /* Unicode char buffer */ 1367 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1368 const char *errors /* error handling */ 1369 ); 1370#endif 1371 1372/* --- ASCII Codecs ------------------------------------------------------- 1373 1374 Only 7-bit ASCII data is excepted. All other codes generate errors. 1375 1376*/ 1377 1378PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1379 const char *string, /* ASCII encoded string */ 1380 Py_ssize_t length, /* size of string */ 1381 const char *errors /* error handling */ 1382 ); 1383 1384PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1385 PyObject *unicode /* Unicode object */ 1386 ); 1387 1388#ifndef Py_LIMITED_API 1389PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( 1390 PyObject* unicode, 1391 const char* errors); 1392 1393PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1394 const Py_UNICODE *data, /* Unicode char buffer */ 1395 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1396 const char *errors /* error handling */ 1397 ); 1398#endif 1399 1400/* --- Character Map Codecs ----------------------------------------------- 1401 1402 This codec uses mappings to encode and decode characters. 1403 1404 Decoding mappings must map single string characters to single 1405 Unicode characters, integers (which are then interpreted as Unicode 1406 ordinals) or None (meaning "undefined mapping" and causing an 1407 error). 1408 1409 Encoding mappings must map single Unicode characters to single 1410 string characters, integers (which are then interpreted as Latin-1 1411 ordinals) or None (meaning "undefined mapping" and causing an 1412 error). 1413 1414 If a character lookup fails with a LookupError, the character is 1415 copied as-is meaning that its ordinal value will be interpreted as 1416 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1417 to contain those mappings which map characters to different code 1418 points. 1419 1420*/ 1421 1422PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1423 const char *string, /* Encoded string */ 1424 Py_ssize_t length, /* size of string */ 1425 PyObject *mapping, /* character mapping 1426 (char ordinal -> unicode ordinal) */ 1427 const char *errors /* error handling */ 1428 ); 1429 1430PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1431 PyObject *unicode, /* Unicode object */ 1432 PyObject *mapping /* character mapping 1433 (unicode ordinal -> char ordinal) */ 1434 ); 1435 1436#ifndef Py_LIMITED_API 1437PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1438 const Py_UNICODE *data, /* Unicode char buffer */ 1439 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1440 PyObject *mapping, /* character mapping 1441 (unicode ordinal -> char ordinal) */ 1442 const char *errors /* error handling */ 1443 ); 1444#endif 1445 1446/* Translate a Py_UNICODE buffer of the given length by applying a 1447 character mapping table to it and return the resulting Unicode 1448 object. 1449 1450 The mapping table must map Unicode ordinal integers to Unicode 1451 ordinal integers or None (causing deletion of the character). 1452 1453 Mapping tables may be dictionaries or sequences. Unmapped character 1454 ordinals (ones which cause a LookupError) are left untouched and 1455 are copied as-is. 1456 1457*/ 1458 1459#ifndef Py_LIMITED_API 1460PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1461 const Py_UNICODE *data, /* Unicode char buffer */ 1462 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1463 PyObject *table, /* Translate table */ 1464 const char *errors /* error handling */ 1465 ); 1466#endif 1467 1468#ifdef HAVE_MBCS 1469 1470/* --- MBCS codecs for Windows -------------------------------------------- */ 1471 1472PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1473 const char *string, /* MBCS encoded string */ 1474 Py_ssize_t length, /* size of string */ 1475 const char *errors /* error handling */ 1476 ); 1477 1478PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1479 const char *string, /* MBCS encoded string */ 1480 Py_ssize_t length, /* size of string */ 1481 const char *errors, /* error handling */ 1482 Py_ssize_t *consumed /* bytes consumed */ 1483 ); 1484 1485PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1486 PyObject *unicode /* Unicode object */ 1487 ); 1488 1489#ifndef Py_LIMITED_API 1490PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1491 const Py_UNICODE *data, /* Unicode char buffer */ 1492 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1493 const char *errors /* error handling */ 1494 ); 1495#endif 1496 1497#endif /* HAVE_MBCS */ 1498 1499/* --- Decimal Encoder ---------------------------------------------------- */ 1500 1501/* Takes a Unicode string holding a decimal value and writes it into 1502 an output buffer using standard ASCII digit codes. 1503 1504 The output buffer has to provide at least length+1 bytes of storage 1505 area. The output string is 0-terminated. 1506 1507 The encoder converts whitespace to ' ', decimal characters to their 1508 corresponding ASCII digit and all other Latin-1 characters except 1509 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 1510 are treated as errors. This includes embedded NULL bytes. 1511 1512 Error handling is defined by the errors argument: 1513 1514 NULL or "strict": raise a ValueError 1515 "ignore": ignore the wrong characters (these are not copied to the 1516 output buffer) 1517 "replace": replaces illegal characters with '?' 1518 1519 Returns 0 on success, -1 on failure. 1520 1521*/ 1522 1523#ifndef Py_LIMITED_API 1524PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1525 Py_UNICODE *s, /* Unicode buffer */ 1526 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1527 char *output, /* Output buffer; must have size >= length */ 1528 const char *errors /* error handling */ 1529 ); 1530#endif 1531 1532/* Transforms code points that have decimal digit property to the 1533 corresponding ASCII digit code points. 1534 1535 Returns a new Unicode string on success, NULL on failure. 1536*/ 1537 1538#ifndef Py_LIMITED_API 1539PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( 1540 Py_UNICODE *s, /* Unicode buffer */ 1541 Py_ssize_t length /* Number of Py_UNICODE chars to transform */ 1542 ); 1543#endif 1544 1545/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject 1546 as argument instead of a raw buffer and length. This function additionally 1547 transforms spaces to ASCII because this is what the callers in longobject, 1548 floatobject, and complexobject did anyways. */ 1549 1550#ifndef Py_LIMITED_API 1551PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( 1552 PyObject *unicode /* Unicode object */ 1553 ); 1554#endif 1555 1556/* --- File system encoding ---------------------------------------------- */ 1557 1558/* ParseTuple converter: encode str objects to bytes using 1559 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 1560 1561PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1562 1563/* ParseTuple converter: decode bytes objects to unicode using 1564 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 1565 1566PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1567 1568/* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1569 and the "surrogateescape" error handler. 1570 1571 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1572 encoding. 1573 1574 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1575*/ 1576 1577PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1578 const char *s /* encoded string */ 1579 ); 1580 1581/* Decode a string using Py_FileSystemDefaultEncoding 1582 and the "surrogateescape" error handler. 1583 1584 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1585 encoding. 1586*/ 1587 1588PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1589 const char *s, /* encoded string */ 1590 Py_ssize_t size /* size */ 1591 ); 1592 1593/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1594 "surrogateescape" error handler, and return bytes. 1595 1596 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1597 encoding. 1598*/ 1599 1600PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1601 PyObject *unicode 1602 ); 1603 1604/* --- Methods & Slots ---------------------------------------------------- 1605 1606 These are capable of handling Unicode objects and strings on input 1607 (we refer to them as strings in the descriptions) and return 1608 Unicode objects or integers as appropriate. */ 1609 1610/* Concat two strings giving a new Unicode string. */ 1611 1612PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1613 PyObject *left, /* Left string */ 1614 PyObject *right /* Right string */ 1615 ); 1616 1617/* Concat two strings and put the result in *pleft 1618 (sets *pleft to NULL on error) */ 1619 1620PyAPI_FUNC(void) PyUnicode_Append( 1621 PyObject **pleft, /* Pointer to left string */ 1622 PyObject *right /* Right string */ 1623 ); 1624 1625/* Concat two strings, put the result in *pleft and drop the right object 1626 (sets *pleft to NULL on error) */ 1627 1628PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1629 PyObject **pleft, /* Pointer to left string */ 1630 PyObject *right /* Right string */ 1631 ); 1632 1633/* Split a string giving a list of Unicode strings. 1634 1635 If sep is NULL, splitting will be done at all whitespace 1636 substrings. Otherwise, splits occur at the given separator. 1637 1638 At most maxsplit splits will be done. If negative, no limit is set. 1639 1640 Separators are not included in the resulting list. 1641 1642*/ 1643 1644PyAPI_FUNC(PyObject*) PyUnicode_Split( 1645 PyObject *s, /* String to split */ 1646 PyObject *sep, /* String separator */ 1647 Py_ssize_t maxsplit /* Maxsplit count */ 1648 ); 1649 1650/* Dito, but split at line breaks. 1651 1652 CRLF is considered to be one line break. Line breaks are not 1653 included in the resulting list. */ 1654 1655PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1656 PyObject *s, /* String to split */ 1657 int keepends /* If true, line end markers are included */ 1658 ); 1659 1660/* Partition a string using a given separator. */ 1661 1662PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1663 PyObject *s, /* String to partition */ 1664 PyObject *sep /* String separator */ 1665 ); 1666 1667/* Partition a string using a given separator, searching from the end of the 1668 string. */ 1669 1670PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1671 PyObject *s, /* String to partition */ 1672 PyObject *sep /* String separator */ 1673 ); 1674 1675/* Split a string giving a list of Unicode strings. 1676 1677 If sep is NULL, splitting will be done at all whitespace 1678 substrings. Otherwise, splits occur at the given separator. 1679 1680 At most maxsplit splits will be done. But unlike PyUnicode_Split 1681 PyUnicode_RSplit splits from the end of the string. If negative, 1682 no limit is set. 1683 1684 Separators are not included in the resulting list. 1685 1686*/ 1687 1688PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1689 PyObject *s, /* String to split */ 1690 PyObject *sep, /* String separator */ 1691 Py_ssize_t maxsplit /* Maxsplit count */ 1692 ); 1693 1694/* Translate a string by applying a character mapping table to it and 1695 return the resulting Unicode object. 1696 1697 The mapping table must map Unicode ordinal integers to Unicode 1698 ordinal integers or None (causing deletion of the character). 1699 1700 Mapping tables may be dictionaries or sequences. Unmapped character 1701 ordinals (ones which cause a LookupError) are left untouched and 1702 are copied as-is. 1703 1704*/ 1705 1706PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1707 PyObject *str, /* String */ 1708 PyObject *table, /* Translate table */ 1709 const char *errors /* error handling */ 1710 ); 1711 1712/* Join a sequence of strings using the given separator and return 1713 the resulting Unicode string. */ 1714 1715PyAPI_FUNC(PyObject*) PyUnicode_Join( 1716 PyObject *separator, /* Separator string */ 1717 PyObject *seq /* Sequence object */ 1718 ); 1719 1720/* Return 1 if substr matches str[start:end] at the given tail end, 0 1721 otherwise. */ 1722 1723PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1724 PyObject *str, /* String */ 1725 PyObject *substr, /* Prefix or Suffix string */ 1726 Py_ssize_t start, /* Start index */ 1727 Py_ssize_t end, /* Stop index */ 1728 int direction /* Tail end: -1 prefix, +1 suffix */ 1729 ); 1730 1731/* Return the first position of substr in str[start:end] using the 1732 given search direction or -1 if not found. -2 is returned in case 1733 an error occurred and an exception is set. */ 1734 1735PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1736 PyObject *str, /* String */ 1737 PyObject *substr, /* Substring to find */ 1738 Py_ssize_t start, /* Start index */ 1739 Py_ssize_t end, /* Stop index */ 1740 int direction /* Find direction: +1 forward, -1 backward */ 1741 ); 1742 1743/* Like PyUnicode_Find, but search for single character only. */ 1744PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 1745 PyObject *str, 1746 Py_UCS4 ch, 1747 Py_ssize_t start, 1748 Py_ssize_t end, 1749 int direction 1750 ); 1751 1752/* Count the number of occurrences of substr in str[start:end]. */ 1753 1754PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1755 PyObject *str, /* String */ 1756 PyObject *substr, /* Substring to count */ 1757 Py_ssize_t start, /* Start index */ 1758 Py_ssize_t end /* Stop index */ 1759 ); 1760 1761/* Replace at most maxcount occurrences of substr in str with replstr 1762 and return the resulting Unicode object. */ 1763 1764PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1765 PyObject *str, /* String */ 1766 PyObject *substr, /* Substring to find */ 1767 PyObject *replstr, /* Substring to replace */ 1768 Py_ssize_t maxcount /* Max. number of replacements to apply; 1769 -1 = all */ 1770 ); 1771 1772/* Compare two strings and return -1, 0, 1 for less than, equal, 1773 greater than resp. */ 1774 1775PyAPI_FUNC(int) PyUnicode_Compare( 1776 PyObject *left, /* Left string */ 1777 PyObject *right /* Right string */ 1778 ); 1779 1780PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1781 PyObject *left, 1782 const char *right /* ASCII-encoded string */ 1783 ); 1784 1785/* Rich compare two strings and return one of the following: 1786 1787 - NULL in case an exception was raised 1788 - Py_True or Py_False for successfully comparisons 1789 - Py_NotImplemented in case the type combination is unknown 1790 1791 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1792 case the conversion of the arguments to Unicode fails with a 1793 UnicodeDecodeError. 1794 1795 Possible values for op: 1796 1797 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1798 1799*/ 1800 1801PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1802 PyObject *left, /* Left string */ 1803 PyObject *right, /* Right string */ 1804 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1805 ); 1806 1807/* Apply a argument tuple or dictionary to a format string and return 1808 the resulting Unicode string. */ 1809 1810PyAPI_FUNC(PyObject *) PyUnicode_Format( 1811 PyObject *format, /* Format string */ 1812 PyObject *args /* Argument tuple or dictionary */ 1813 ); 1814 1815/* Checks whether element is contained in container and return 1/0 1816 accordingly. 1817 1818 element has to coerce to an one element Unicode string. -1 is 1819 returned in case of an error. */ 1820 1821PyAPI_FUNC(int) PyUnicode_Contains( 1822 PyObject *container, /* Container string */ 1823 PyObject *element /* Element string */ 1824 ); 1825 1826/* Checks whether argument is a valid identifier. */ 1827 1828PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1829 1830#ifndef Py_LIMITED_API 1831/* Externally visible for str.strip(unicode) */ 1832PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 1833 PyUnicodeObject *self, 1834 int striptype, 1835 PyObject *sepobj 1836 ); 1837#endif 1838 1839/* Using the current locale, insert the thousands grouping 1840 into the string pointed to by buffer. For the argument descriptions, 1841 see Objects/stringlib/localeutil.h */ 1842 1843#ifndef Py_LIMITED_API 1844PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer, 1845 Py_ssize_t n_buffer, 1846 Py_UNICODE *digits, 1847 Py_ssize_t n_digits, 1848 Py_ssize_t min_width); 1849#endif 1850 1851/* Using explicit passed-in values, insert the thousands grouping 1852 into the string pointed to by buffer. For the argument descriptions, 1853 see Objects/stringlib/localeutil.h */ 1854#ifndef Py_LIMITED_API 1855PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( 1856 PyObject *unicode, 1857 int kind, 1858 void *buffer, 1859 Py_ssize_t n_buffer, 1860 void *digits, 1861 Py_ssize_t n_digits, 1862 Py_ssize_t min_width, 1863 const char *grouping, 1864 const char *thousands_sep); 1865#endif 1866/* === Characters Type APIs =============================================== */ 1867 1868/* Helper array used by Py_UNICODE_ISSPACE(). */ 1869 1870#ifndef Py_LIMITED_API 1871PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 1872 1873/* These should not be used directly. Use the Py_UNICODE_IS* and 1874 Py_UNICODE_TO* macros instead. 1875 1876 These APIs are implemented in Objects/unicodectype.c. 1877 1878*/ 1879 1880PyAPI_FUNC(int) _PyUnicode_IsLowercase( 1881 Py_UCS4 ch /* Unicode character */ 1882 ); 1883 1884PyAPI_FUNC(int) _PyUnicode_IsUppercase( 1885 Py_UCS4 ch /* Unicode character */ 1886 ); 1887 1888PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 1889 Py_UCS4 ch /* Unicode character */ 1890 ); 1891 1892PyAPI_FUNC(int) _PyUnicode_IsXidStart( 1893 Py_UCS4 ch /* Unicode character */ 1894 ); 1895 1896PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 1897 Py_UCS4 ch /* Unicode character */ 1898 ); 1899 1900PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 1901 const Py_UCS4 ch /* Unicode character */ 1902 ); 1903 1904PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 1905 const Py_UCS4 ch /* Unicode character */ 1906 ); 1907 1908PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 1909 Py_UCS4 ch /* Unicode character */ 1910 ); 1911 1912PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 1913 Py_UCS4 ch /* Unicode character */ 1914 ); 1915 1916PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 1917 Py_UCS4 ch /* Unicode character */ 1918 ); 1919 1920PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 1921 Py_UCS4 ch /* Unicode character */ 1922 ); 1923 1924PyAPI_FUNC(int) _PyUnicode_ToDigit( 1925 Py_UCS4 ch /* Unicode character */ 1926 ); 1927 1928PyAPI_FUNC(double) _PyUnicode_ToNumeric( 1929 Py_UCS4 ch /* Unicode character */ 1930 ); 1931 1932PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 1933 Py_UCS4 ch /* Unicode character */ 1934 ); 1935 1936PyAPI_FUNC(int) _PyUnicode_IsDigit( 1937 Py_UCS4 ch /* Unicode character */ 1938 ); 1939 1940PyAPI_FUNC(int) _PyUnicode_IsNumeric( 1941 Py_UCS4 ch /* Unicode character */ 1942 ); 1943 1944PyAPI_FUNC(int) _PyUnicode_IsPrintable( 1945 Py_UCS4 ch /* Unicode character */ 1946 ); 1947 1948PyAPI_FUNC(int) _PyUnicode_IsAlpha( 1949 Py_UCS4 ch /* Unicode character */ 1950 ); 1951 1952PyAPI_FUNC(size_t) Py_UNICODE_strlen( 1953 const Py_UNICODE *u 1954 ); 1955 1956PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 1957 Py_UNICODE *s1, 1958 const Py_UNICODE *s2); 1959 1960PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat( 1961 Py_UNICODE *s1, const Py_UNICODE *s2); 1962 1963PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 1964 Py_UNICODE *s1, 1965 const Py_UNICODE *s2, 1966 size_t n); 1967 1968PyAPI_FUNC(int) Py_UNICODE_strcmp( 1969 const Py_UNICODE *s1, 1970 const Py_UNICODE *s2 1971 ); 1972 1973PyAPI_FUNC(int) Py_UNICODE_strncmp( 1974 const Py_UNICODE *s1, 1975 const Py_UNICODE *s2, 1976 size_t n 1977 ); 1978 1979PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 1980 const Py_UNICODE *s, 1981 Py_UNICODE c 1982 ); 1983 1984PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 1985 const Py_UNICODE *s, 1986 Py_UNICODE c 1987 ); 1988 1989PyAPI_FUNC(size_t) Py_UCS4_strlen( 1990 const Py_UCS4 *u 1991 ); 1992 1993PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy( 1994 Py_UCS4 *s1, 1995 const Py_UCS4 *s2); 1996 1997PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat( 1998 Py_UCS4 *s1, const Py_UCS4 *s2); 1999 2000PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy( 2001 Py_UCS4 *s1, 2002 const Py_UCS4 *s2, 2003 size_t n); 2004 2005PyAPI_FUNC(int) Py_UCS4_strcmp( 2006 const Py_UCS4 *s1, 2007 const Py_UCS4 *s2 2008 ); 2009 2010PyAPI_FUNC(int) Py_UCS4_strncmp( 2011 const Py_UCS4 *s1, 2012 const Py_UCS4 *s2, 2013 size_t n 2014 ); 2015 2016PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr( 2017 const Py_UCS4 *s, 2018 Py_UCS4 c 2019 ); 2020 2021PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr( 2022 const Py_UCS4 *s, 2023 Py_UCS4 c 2024 ); 2025 2026/* Create a copy of a unicode string ending with a nul character. Return NULL 2027 and raise a MemoryError exception on memory allocation failure, otherwise 2028 return a new allocated buffer (use PyMem_Free() to free the buffer). */ 2029 2030PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 2031 PyObject *unicode 2032 ); 2033#endif /* Py_LIMITED_API */ 2034 2035#if defined(Py_DEBUG) && !defined(Py_LIMITED_API) 2036/* FIXME: use PyObject* type for op */ 2037PyAPI_FUNC(int) _PyUnicode_CheckConsistency( 2038 void *op, 2039 int check_content); 2040#endif 2041 2042#ifdef __cplusplus 2043} 2044#endif 2045#endif /* !Py_UNICODEOBJECT_H */ 2046