unicodeobject.h revision 02b75abf731831e32bbb8007a3278c14f6ad700a
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal. (See 11http://www.egenix.com/files/python/unicode-proposal.txt). 12 13Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58#include <ctype.h> 59 60/* === Internal API ======================================================= */ 61 62/* --- Internal Unicode Format -------------------------------------------- */ 63 64/* Python 3.x requires unicode */ 65#define Py_USING_UNICODE 66 67#ifndef SIZEOF_WCHAR_T 68#error Must define SIZEOF_WCHAR_T 69#endif 70 71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Py_UNICODE was the native Unicode storage format (code unit) used by 87 Python and represents a single Unicode element in the Unicode type. 88 With PEP 393, Py_UNICODE is deprecated and replaced with a 89 typedef to wchar_t. */ 90 91#ifndef Py_LIMITED_API 92#define PY_UNICODE_TYPE wchar_t 93typedef wchar_t Py_UNICODE; 94#endif 95 96/* If the compiler provides a wchar_t type we try to support it 97 through the interface functions PyUnicode_FromWideChar(), 98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). 
*/ 99 100#ifdef HAVE_USABLE_WCHAR_T 101# ifndef HAVE_WCHAR_H 102# define HAVE_WCHAR_H 103# endif 104#endif 105 106#if defined(MS_WINDOWS) 107# define HAVE_MBCS 108#endif 109 110#ifdef HAVE_WCHAR_H 111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 112# ifdef _HAVE_BSDI 113# include <time.h> 114# endif 115# include <wchar.h> 116#endif 117 118/* Py_UCS4 and Py_UCS2 are typedefs for the respective 119 unicode representations. */ 120#if SIZEOF_INT == 4 121typedef unsigned int Py_UCS4; 122#elif SIZEOF_LONG == 4 123typedef unsigned long Py_UCS4; 124#else 125#error "Could not find a proper typedef for Py_UCS4" 126#endif 127 128#if SIZEOF_SHORT == 2 129typedef unsigned short Py_UCS2; 130#else 131#error "Could not find a proper typedef for Py_UCS2" 132#endif 133 134typedef unsigned char Py_UCS1; 135 136/* --- Internal Unicode Operations ---------------------------------------- */ 137 138/* Since splitting on whitespace is an important use case, and 139 whitespace in most situations is solely ASCII whitespace, we 140 optimize for the common case by using a quick look-up table 141 _Py_ascii_whitespace (see below) with an inlined check. 142 143 */ 144#ifndef Py_LIMITED_API 145#define Py_UNICODE_ISSPACE(ch) \ 146 ((ch) < 128U ? 
_Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 147 148#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 149#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 150#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 151#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 152 153#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 154#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 155#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 156 157#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 158#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 159#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 160#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 161 162#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 163#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 164#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 165 166#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 167 168#define Py_UNICODE_ISALNUM(ch) \ 169 (Py_UNICODE_ISALPHA(ch) || \ 170 Py_UNICODE_ISDECIMAL(ch) || \ 171 Py_UNICODE_ISDIGIT(ch) || \ 172 Py_UNICODE_ISNUMERIC(ch)) 173 174#define Py_UNICODE_COPY(target, source, length) \ 175 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 176 177#define Py_UNICODE_FILL(target, value, length) \ 178 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 179 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 180 } while (0) 181 182/* macros to work with surrogates */ 183#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) 184#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF) 185#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF) 186/* Join two surrogate characters and return a single Py_UCS4 value. 
*/ 187#define Py_UNICODE_JOIN_SURROGATES(high, low) \ 188 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 189 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 190/* high surrogate = top 10 bits added to D800 */ 191#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10)) 192/* low surrogate = bottom 10 bits added to DC00 */ 193#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF)) 194 195/* Check if substring matches at given offset. The offset must be 196 valid, and the substring must not be empty. */ 197 198#define Py_UNICODE_MATCH(string, offset, substring) \ 199 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ 200 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ 201 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) 202 203#endif /* Py_LIMITED_API */ 204 205#ifdef __cplusplus 206extern "C" { 207#endif 208 209/* --- Unicode Type ------------------------------------------------------- */ 210 211#ifndef Py_LIMITED_API 212 213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject 214 structure. state.ascii and state.compact are set, and the data 215 immediately follow the structure. utf8_length and wstr_length can be found 216 in the length field; the utf8 pointer is equal to the data pointer. 
*/ 217typedef struct { 218 /* There are 4 forms of Unicode strings: 219 220 - compact ascii: 221 222 * structure = PyASCIIObject 223 * test: PyUnicode_IS_COMPACT_ASCII(op) 224 * kind = PyUnicode_1BYTE_KIND 225 * compact = 1 226 * ascii = 1 227 * ready = 1 228 * (length is the length of the utf8 and wstr strings) 229 * (data starts just after the structure) 230 * (since ASCII is decoded from UTF-8, the utf8 string are the data) 231 232 - compact: 233 234 * structure = PyCompactUnicodeObject 235 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op) 236 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 237 PyUnicode_4BYTE_KIND 238 * compact = 1 239 * ready = 1 240 * ascii = 0 241 * utf8 is not shared with data 242 * utf8_length = 0 if utf8 is NULL 243 * wstr is shared with data and wstr_length=length 244 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 245 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4 246 * wstr_length = 0 if wstr is NULL 247 * (data starts just after the structure) 248 249 - legacy string, not ready: 250 251 * structure = PyUnicodeObject 252 * test: kind == PyUnicode_WCHAR_KIND 253 * length = 0 (use wstr_length) 254 * hash = -1 255 * kind = PyUnicode_WCHAR_KIND 256 * compact = 0 257 * ascii = 0 258 * ready = 0 259 * interned = SSTATE_NOT_INTERNED 260 * wstr is not NULL 261 * data.any is NULL 262 * utf8 is NULL 263 * utf8_length = 0 264 265 - legacy string, ready: 266 267 * structure = PyUnicodeObject structure 268 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND 269 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 270 PyUnicode_4BYTE_KIND 271 * compact = 0 272 * ready = 1 273 * data.any is not NULL 274 * utf8 is shared and utf8_length = length with data.any if ascii = 1 275 * utf8_length = 0 if utf8 is NULL 276 * wstr is shared with data.any and wstr_length = length 277 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 278 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 279 * wstr_length = 0 if wstr 
is NULL 280 281 Compact strings use only one memory block (structure + characters), 282 whereas legacy strings use one block for the structure and one block 283 for characters. 284 285 Legacy strings are created by PyUnicode_FromUnicode() and 286 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready 287 when PyUnicode_READY() is called. 288 289 See also _PyUnicode_CheckConsistency(). 290 */ 291 PyObject_HEAD 292 Py_ssize_t length; /* Number of code points in the string */ 293 Py_hash_t hash; /* Hash value; -1 if not set */ 294 struct { 295 /* 296 SSTATE_NOT_INTERNED (0) 297 SSTATE_INTERNED_MORTAL (1) 298 SSTATE_INTERNED_IMMORTAL (2) 299 300 If interned != SSTATE_NOT_INTERNED, the two references from the 301 dictionary to this object are *not* counted in ob_refcnt. 302 */ 303 unsigned int interned:2; 304 /* Character size: 305 306 - PyUnicode_WCHAR_KIND (0): 307 308 * character type = wchar_t (16 or 32 bits, depending on the 309 platform) 310 311 - PyUnicode_1BYTE_KIND (1): 312 313 * character type = Py_UCS1 (8 bits, unsigned) 314 * all characters are in the range U+0000-U+00FF (latin1) 315 * if ascii is set, all characters are in the range U+0000-U+007F 316 (ASCII), otherwise at least one character is in the range 317 U+0080-U+00FF 318 319 - PyUnicode_2BYTE_KIND (2): 320 321 * character type = Py_UCS2 (16 bits, unsigned) 322 * all characters are in the range U+0000-U+FFFF (BMP) 323 * at least one character is in the range U+0100-U+FFFF 324 325 - PyUnicode_4BYTE_KIND (4): 326 327 * character type = Py_UCS4 (32 bits, unsigned) 328 * all characters are in the range U+0000-U+10FFFF 329 * at least one character is in the range U+10000-U+10FFFF 330 */ 331 unsigned int kind:3; 332 /* Compact is with respect to the allocation scheme. Compact unicode 333 objects only require one memory block while non-compact objects use 334 one block for the PyUnicodeObject struct and another for its data 335 buffer. 
*/ 336 unsigned int compact:1; 337 /* The string only contains characters in the range U+0000-U+007F (ASCII) 338 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is 339 set, use the PyASCIIObject structure. */ 340 unsigned int ascii:1; 341 /* The ready flag indicates whether the object layout is initialized 342 completely. This means that this is either a compact object, or 343 the data pointer is filled out. The bit is redundant, and helps 344 to minimize the test in PyUnicode_IS_READY(). */ 345 unsigned int ready:1; 346 /* Padding to ensure that PyUnicode_DATA() is always aligned to 347 4 bytes (see issue #19537 on m68k). */ 348 unsigned int :24; 349 } state; 350 wchar_t *wstr; /* wchar_t representation (null-terminated) */ 351} PyASCIIObject; 352 353/* Non-ASCII strings allocated through PyUnicode_New use the 354 PyCompactUnicodeObject structure. state.compact is set, and the data 355 immediately follow the structure. */ 356typedef struct { 357 PyASCIIObject _base; 358 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the 359 * terminating \0. */ 360 char *utf8; /* UTF-8 representation (null-terminated) */ 361 Py_ssize_t wstr_length; /* Number of code points in wstr, possible 362 * surrogates count as two code points. */ 363} PyCompactUnicodeObject; 364 365/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the 366 PyUnicodeObject structure. The actual string data is initially in the wstr 367 block, and copied into the data block using _PyUnicode_Ready. 
*/ 368typedef struct { 369 PyCompactUnicodeObject _base; 370 union { 371 void *any; 372 Py_UCS1 *latin1; 373 Py_UCS2 *ucs2; 374 Py_UCS4 *ucs4; 375 } data; /* Canonical, smallest-form Unicode buffer */ 376} PyUnicodeObject; 377#endif 378 379PyAPI_DATA(PyTypeObject) PyUnicode_Type; 380PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 381 382#define PyUnicode_Check(op) \ 383 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 384#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 385 386/* Fast access macros */ 387#ifndef Py_LIMITED_API 388 389#define PyUnicode_WSTR_LENGTH(op) \ 390 (PyUnicode_IS_COMPACT_ASCII(op) ? \ 391 ((PyASCIIObject*)op)->length : \ 392 ((PyCompactUnicodeObject*)op)->wstr_length) 393 394/* Returns the deprecated Py_UNICODE representation's size in code units 395 (this includes surrogate pairs as 2 units). 396 If the Py_UNICODE representation is not available, it will be computed 397 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ 398 399#define PyUnicode_GET_SIZE(op) \ 400 (assert(PyUnicode_Check(op)), \ 401 (((PyASCIIObject *)(op))->wstr) ? \ 402 PyUnicode_WSTR_LENGTH(op) : \ 403 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \ 404 assert(((PyASCIIObject *)(op))->wstr), \ 405 PyUnicode_WSTR_LENGTH(op))) 406 407#define PyUnicode_GET_DATA_SIZE(op) \ 408 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) 409 410/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE 411 representation on demand. Using this macro is very inefficient now, 412 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or 413 use PyUnicode_WRITE() and PyUnicode_READ(). */ 414 415#define PyUnicode_AS_UNICODE(op) \ 416 (assert(PyUnicode_Check(op)), \ 417 (((PyASCIIObject *)(op))->wstr) ? 
(((PyASCIIObject *)(op))->wstr) : \ 418 PyUnicode_AsUnicode((PyObject *)(op))) 419 420#define PyUnicode_AS_DATA(op) \ 421 ((const char *)(PyUnicode_AS_UNICODE(op))) 422 423 424/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ 425 426/* Values for PyASCIIObject.state: */ 427 428/* Interning state. */ 429#define SSTATE_NOT_INTERNED 0 430#define SSTATE_INTERNED_MORTAL 1 431#define SSTATE_INTERNED_IMMORTAL 2 432 433/* Return true if the string contains only ASCII characters, or 0 if not. The 434 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be 435 ready. */ 436#define PyUnicode_IS_ASCII(op) \ 437 (assert(PyUnicode_Check(op)), \ 438 assert(PyUnicode_IS_READY(op)), \ 439 ((PyASCIIObject*)op)->state.ascii) 440 441/* Return true if the string is compact or 0 if not. 442 No type checks or Ready calls are performed. */ 443#define PyUnicode_IS_COMPACT(op) \ 444 (((PyASCIIObject*)(op))->state.compact) 445 446/* Return true if the string is a compact ASCII string (use PyASCIIObject 447 structure), or 0 if not. No type checks or Ready calls are performed. */ 448#define PyUnicode_IS_COMPACT_ASCII(op) \ 449 (((PyASCIIObject*)op)->state.ascii && PyUnicode_IS_COMPACT(op)) 450 451enum PyUnicode_Kind { 452/* String contains only wstr byte characters. This is only possible 453 when the string was created with a legacy API and _PyUnicode_Ready() 454 has not been called yet. */ 455 PyUnicode_WCHAR_KIND = 0, 456/* Return values of the PyUnicode_KIND() macro: */ 457 PyUnicode_1BYTE_KIND = 1, 458 PyUnicode_2BYTE_KIND = 2, 459 PyUnicode_4BYTE_KIND = 4 460}; 461 462/* Return pointers to the canonical representation cast to unsigned char, 463 Py_UCS2, or Py_UCS4 for direct character access. 464 No checks are performed, use PyUnicode_KIND() before to ensure 465 these will work correctly. 
*/ 466 467#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op)) 468#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) 469#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) 470 471/* Return one of the PyUnicode_*_KIND values defined above. */ 472#define PyUnicode_KIND(op) \ 473 (assert(PyUnicode_Check(op)), \ 474 assert(PyUnicode_IS_READY(op)), \ 475 ((PyASCIIObject *)(op))->state.kind) 476 477/* Return a void pointer to the raw unicode buffer. */ 478#define _PyUnicode_COMPACT_DATA(op) \ 479 (PyUnicode_IS_ASCII(op) ? \ 480 ((void*)((PyASCIIObject*)(op) + 1)) : \ 481 ((void*)((PyCompactUnicodeObject*)(op) + 1))) 482 483#define _PyUnicode_NONCOMPACT_DATA(op) \ 484 (assert(((PyUnicodeObject*)(op))->data.any), \ 485 ((((PyUnicodeObject *)(op))->data.any))) 486 487#define PyUnicode_DATA(op) \ 488 (assert(PyUnicode_Check(op)), \ 489 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \ 490 _PyUnicode_NONCOMPACT_DATA(op)) 491 492/* In the access macros below, "kind" may be evaluated more than once. 493 All other macro parameters are evaluated exactly once, so it is safe 494 to put side effects into them (such as increasing the index). */ 495 496/* Write into the canonical representation, this macro does not do any sanity 497 checks and is intended for usage in loops. The caller should cache the 498 kind and data pointers obtained from other macro calls. 499 index is the index in the string (starts at 0) and value is the new 500 code point value which should be written to that location. 
*/ 501#define PyUnicode_WRITE(kind, data, index, value) \ 502 do { \ 503 switch ((kind)) { \ 504 case PyUnicode_1BYTE_KIND: { \ 505 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \ 506 break; \ 507 } \ 508 case PyUnicode_2BYTE_KIND: { \ 509 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \ 510 break; \ 511 } \ 512 default: { \ 513 assert((kind) == PyUnicode_4BYTE_KIND); \ 514 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \ 515 } \ 516 } \ 517 } while (0) 518 519/* Read a code point from the string's canonical representation. No checks 520 or ready calls are performed. */ 521#define PyUnicode_READ(kind, data, index) \ 522 ((Py_UCS4) \ 523 ((kind) == PyUnicode_1BYTE_KIND ? \ 524 ((const Py_UCS1 *)(data))[(index)] : \ 525 ((kind) == PyUnicode_2BYTE_KIND ? \ 526 ((const Py_UCS2 *)(data))[(index)] : \ 527 ((const Py_UCS4 *)(data))[(index)] \ 528 ) \ 529 )) 530 531/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it 532 calls PyUnicode_KIND() and might call it twice. For single reads, use 533 PyUnicode_READ_CHAR, for multiple consecutive reads callers should 534 cache kind and use PyUnicode_READ instead. */ 535#define PyUnicode_READ_CHAR(unicode, index) \ 536 (assert(PyUnicode_Check(unicode)), \ 537 assert(PyUnicode_IS_READY(unicode)), \ 538 (Py_UCS4) \ 539 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \ 540 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \ 541 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \ 542 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \ 543 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \ 544 ) \ 545 )) 546 547/* Returns the length of the unicode string. The caller has to make sure that 548 the string has it's canonical representation set before calling 549 this macro. Call PyUnicode_(FAST_)Ready to ensure that. 
*/ 550#define PyUnicode_GET_LENGTH(op) \ 551 (assert(PyUnicode_Check(op)), \ 552 assert(PyUnicode_IS_READY(op)), \ 553 ((PyASCIIObject *)(op))->length) 554 555 556/* Fast check to determine whether an object is ready. Equivalent to 557 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */ 558 559#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) 560 561/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best 562 case. If the canonical representation is not yet set, it will still call 563 _PyUnicode_Ready(). 564 Returns 0 on success and -1 on errors. */ 565#define PyUnicode_READY(op) \ 566 (assert(PyUnicode_Check(op)), \ 567 (PyUnicode_IS_READY(op) ? \ 568 0 : _PyUnicode_Ready((PyObject *)(op)))) 569 570/* Return a maximum character value which is suitable for creating another 571 string based on op. This is always an approximation but more efficient 572 than iterating over the string. */ 573#define PyUnicode_MAX_CHAR_VALUE(op) \ 574 (assert(PyUnicode_IS_READY(op)), \ 575 (PyUnicode_IS_ASCII(op) ? \ 576 (0x7f) : \ 577 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ 578 (0xffU) : \ 579 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \ 580 (0xffffU) : \ 581 (0x10ffffU))))) 582 583#endif 584 585/* --- Constants ---------------------------------------------------------- */ 586 587/* This Unicode character will be used as replacement character during 588 decoding if the errors argument is set to "replace". Note: the 589 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 590 Unicode 3.0. */ 591 592#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 593 594/* === Public API ========================================================= */ 595 596/* --- Plain Py_UNICODE --------------------------------------------------- */ 597 598/* With PEP 393, this is the recommended way to allocate a new unicode object. 599 This function will allocate the object and its buffer in a single memory 600 block. 
Objects created using this function are not resizable. */ 601#ifndef Py_LIMITED_API 602PyAPI_FUNC(PyObject*) PyUnicode_New( 603 Py_ssize_t size, /* Number of code points in the new string */ 604 Py_UCS4 maxchar /* maximum code point value in the string */ 605 ); 606#endif 607 608/* Initializes the canonical string representation from the deprecated 609 wstr/Py_UNICODE representation. This function is used to convert Unicode 610 objects which were created using the old API to the new flexible format 611 introduced with PEP 393. 612 613 Don't call this function directly, use the public PyUnicode_READY() macro 614 instead. */ 615#ifndef Py_LIMITED_API 616PyAPI_FUNC(int) _PyUnicode_Ready( 617 PyObject *unicode /* Unicode object */ 618 ); 619#endif 620 621/* Get a copy of a Unicode string. */ 622#ifndef Py_LIMITED_API 623PyAPI_FUNC(PyObject*) _PyUnicode_Copy( 624 PyObject *unicode 625 ); 626#endif 627 628/* Copy character from one unicode object into another, this function performs 629 character conversion when necessary and falls back to memcpy() if possible. 630 631 Fail if to is too small (smaller than *how_many* or smaller than 632 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > 633 kind(to), or if *to* has more than 1 reference. 634 635 Return the number of written character, or return -1 and raise an exception 636 on error. 637 638 Pseudo-code: 639 640 how_many = min(how_many, len(from) - from_start) 641 to[to_start:to_start+how_many] = from[from_start:from_start+how_many] 642 return how_many 643 644 Note: The function doesn't write a terminating null character. 645 */ 646#ifndef Py_LIMITED_API 647PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( 648 PyObject *to, 649 Py_ssize_t to_start, 650 PyObject *from, 651 Py_ssize_t from_start, 652 Py_ssize_t how_many 653 ); 654 655/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so 656 may crash if parameters are invalid (e.g. if the output string 657 is too short). 
*/ 658PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters( 659 PyObject *to, 660 Py_ssize_t to_start, 661 PyObject *from, 662 Py_ssize_t from_start, 663 Py_ssize_t how_many 664 ); 665#endif 666 667#ifndef Py_LIMITED_API 668/* Fill a string with a character: write fill_char into 669 unicode[start:start+length]. 670 671 Fail if fill_char is bigger than the string maximum character, or if the 672 string has more than 1 reference. 673 674 Return the number of written character, or return -1 and raise an exception 675 on error. */ 676PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill( 677 PyObject *unicode, 678 Py_ssize_t start, 679 Py_ssize_t length, 680 Py_UCS4 fill_char 681 ); 682 683/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash 684 if parameters are invalid (e.g. if length is longer than the string). */ 685PyAPI_FUNC(void) _PyUnicode_FastFill( 686 PyObject *unicode, 687 Py_ssize_t start, 688 Py_ssize_t length, 689 Py_UCS4 fill_char 690 ); 691#endif 692 693/* Create a Unicode Object from the Py_UNICODE buffer u of the given 694 size. 695 696 u may be NULL which causes the contents to be undefined. It is the 697 user's responsibility to fill in the needed data afterwards. Note 698 that modifying the Unicode object contents after construction is 699 only allowed if u was set to NULL. 700 701 The buffer is copied into the new object. */ 702 703#ifndef Py_LIMITED_API 704PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 705 const Py_UNICODE *u, /* Unicode buffer */ 706 Py_ssize_t size /* size of buffer */ 707 ); 708#endif 709 710/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 711PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 712 const char *u, /* UTF-8 encoded string */ 713 Py_ssize_t size /* size of buffer */ 714 ); 715 716/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 717 UTF-8 encoded bytes. The size is determined with strlen(). 
*/ 718PyAPI_FUNC(PyObject*) PyUnicode_FromString( 719 const char *u /* UTF-8 encoded string */ 720 ); 721 722#ifndef Py_LIMITED_API 723/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. 724 Scan the string to find the maximum character. */ 725PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( 726 int kind, 727 const void *buffer, 728 Py_ssize_t size); 729 730/* Create a new string from a buffer of ASCII characters. 731 WARNING: Don't check if the string contains any non-ASCII character. */ 732PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII( 733 const char *buffer, 734 Py_ssize_t size); 735#endif 736 737PyAPI_FUNC(PyObject*) PyUnicode_Substring( 738 PyObject *str, 739 Py_ssize_t start, 740 Py_ssize_t end); 741 742#ifndef Py_LIMITED_API 743/* Compute the maximum character of the substring unicode[start:end]. 744 Return 127 for an empty string. */ 745PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( 746 PyObject *unicode, 747 Py_ssize_t start, 748 Py_ssize_t end); 749#endif 750 751/* Copy the string into a UCS4 buffer including the null character if copy_null 752 is set. Return NULL and raise an exception on error. Raise a ValueError if 753 the buffer is smaller than the string. Return buffer on success. 754 755 buflen is the length of the buffer in (Py_UCS4) characters. */ 756PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 757 PyObject *unicode, 758 Py_UCS4* buffer, 759 Py_ssize_t buflen, 760 int copy_null); 761 762/* Copy the string into a UCS4 buffer. A new buffer is allocated using 763 * PyMem_Malloc; if this fails, NULL is returned with a memory error 764 exception set. */ 765PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 766 767/* Return a read-only pointer to the Unicode object's internal 768 Py_UNICODE buffer. 769 If the wchar_t/Py_UNICODE representation is not yet available, this 770 function will calculate it. 
*/ 771 772#ifndef Py_LIMITED_API 773PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 774 PyObject *unicode /* Unicode object */ 775 ); 776#endif 777 778/* Return a read-only pointer to the Unicode object's internal 779 Py_UNICODE buffer and save the length at size. 780 If the wchar_t/Py_UNICODE representation is not yet available, this 781 function will calculate it. */ 782 783#ifndef Py_LIMITED_API 784PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( 785 PyObject *unicode, /* Unicode object */ 786 Py_ssize_t *size /* location where to save the length */ 787 ); 788#endif 789 790/* Get the length of the Unicode object. */ 791 792PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 793 PyObject *unicode 794); 795 796/* Get the number of Py_UNICODE units in the 797 string representation. */ 798 799PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 800 PyObject *unicode /* Unicode object */ 801 ); 802 803/* Read a character from the string. */ 804 805PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 806 PyObject *unicode, 807 Py_ssize_t index 808 ); 809 810/* Write a character to the string. The string must have been created through 811 PyUnicode_New, must not be shared, and must not have been hashed yet. 812 813 Return 0 on success, -1 on error. */ 814 815PyAPI_FUNC(int) PyUnicode_WriteChar( 816 PyObject *unicode, 817 Py_ssize_t index, 818 Py_UCS4 character 819 ); 820 821#ifndef Py_LIMITED_API 822/* Get the maximum ordinal for a Unicode character. */ 823PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 824#endif 825 826/* Resize a Unicode object. The length is the number of characters, except 827 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length 828 is the number of Py_UNICODE characters. 829 830 *unicode is modified to point to the new (resized) object and 0 831 returned on success. 832 833 Try to resize the string in place (which is usually faster than allocating 834 a new string and copy characters), or create a new string. 
835 836 Error handling is implemented as follows: an exception is set, -1 837 is returned and *unicode left untouched. 838 839 WARNING: The function doesn't check string content, the result may not be a 840 string in canonical representation. */ 841 842PyAPI_FUNC(int) PyUnicode_Resize( 843 PyObject **unicode, /* Pointer to the Unicode object */ 844 Py_ssize_t length /* New length */ 845 ); 846 847/* Decode obj to a Unicode object. 848 849 bytes, bytearray and other bytes-like objects are decoded according to the 850 given encoding and error handler. The encoding and error handler can be 851 NULL to have the interface use UTF-8 and "strict". 852 853 All other objects (including Unicode objects) raise an exception. 854 855 The API returns NULL in case of an error. The caller is responsible 856 for decref'ing the returned objects. 857 858*/ 859 860PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 861 PyObject *obj, /* Object */ 862 const char *encoding, /* encoding */ 863 const char *errors /* error handling */ 864 ); 865 866/* Copy an instance of a Unicode subtype to a new true Unicode object if 867 necessary. If obj is already a true Unicode object (not a subtype), return 868 the reference with *incremented* refcount. 869 870 The API returns NULL in case of an error. The caller is responsible 871 for decref'ing the returned objects. 872 873*/ 874 875PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 876 PyObject *obj /* Object */ 877 ); 878 879PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 880 const char *format, /* ASCII-encoded string */ 881 va_list vargs 882 ); 883PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 884 const char *format, /* ASCII-encoded string */ 885 ... 
886 ); 887 888#ifndef Py_LIMITED_API /* State shared by the _PyUnicodeWriter_*() functions and macros, which all take a pointer to this structure. */ 889typedef struct { 890 PyObject *buffer; 891 void *data; 892 enum PyUnicode_Kind kind; 893 Py_UCS4 maxchar; 894 Py_ssize_t size; 895 Py_ssize_t pos; /* NOTE(review): _PyUnicodeWriter_Prepare() treats size - pos as the room left in the buffer and maxchar as the largest character that fits without further preparation -- confirm against the implementation. */ 896 897 /* minimum number of allocated characters (default: 0) */ 898 Py_ssize_t min_length; 899 900 /* minimum character (default: 127, ASCII) */ 901 Py_UCS4 min_char; 902 903 /* If non-zero, overallocate the buffer (default: 0). */ 904 unsigned char overallocate; 905 906 /* If readonly is 1, buffer is a shared string (cannot be modified) 907 and size is set to 0. */ 908 unsigned char readonly; 909} _PyUnicodeWriter ; 910 911/* Initialize a Unicode writer. 912 * 913 * By default, the minimum buffer size is 0 characters and overallocation is 914 * disabled. Set min_length, min_char and overallocate attributes to control 915 * the allocation of the buffer. */ 916PyAPI_FUNC(void) 917_PyUnicodeWriter_Init(_PyUnicodeWriter *writer); 918 919/* Prepare the buffer to write 'length' characters 920 with the specified maximum character. 921 922 Nothing is done if the buffer already has enough room for the requested maximum character, or if LENGTH is 0. This is a macro: WRITER, LENGTH and MAXCHAR are each expanded more than once, so pass expressions without side effects. Return 0 on success, raise an exception and return -1 on error. */ 923#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \ 924 (((MAXCHAR) <= (WRITER)->maxchar \ 925 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \ 926 ? 0 \ 927 : (((LENGTH) == 0) \ 928 ? 0 \ 929 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))) 930 931/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro 932 instead. */ 933PyAPI_FUNC(int) 934_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 935 Py_ssize_t length, Py_UCS4 maxchar); 936 937/* Prepare the buffer to have at least the kind KIND. 938 For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will 939 support characters in range U+0000-U+FFFF. 940 941 KIND must not be PyUnicode_WCHAR_KIND (checked with assert()). This is a macro: KIND is expanded more than once, so pass an expression without side effects. Return 0 on success, raise an exception and return -1 on error. */ 942#define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \ 943 (assert((KIND) != PyUnicode_WCHAR_KIND), \ 944 (KIND) <= (WRITER)->kind \ 945 ?
0 \ 946 : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))) 947 948/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind() 949 macro instead. */ 950PyAPI_FUNC(int) 951_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, 952 enum PyUnicode_Kind kind); 953 954/* Append a Unicode character. 955 Return 0 on success, raise an exception and return -1 on error. */ 956PyAPI_FUNC(int) 957_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, 958 Py_UCS4 ch 959 ); 960 961/* Append a Unicode string. 962 Return 0 on success, raise an exception and return -1 on error. */ 963PyAPI_FUNC(int) 964_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, 965 PyObject *str /* Unicode string */ 966 ); 967 968/* Append a substring of a Unicode string. 969 Return 0 on success, raise an exception and return -1 on error. */ 970PyAPI_FUNC(int) 971_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, 972 PyObject *str, /* Unicode string */ 973 Py_ssize_t start, 974 Py_ssize_t end 975 ); 976 977/* Append an ASCII-encoded byte string. 978 Return 0 on success, raise an exception and return -1 on error. */ 979PyAPI_FUNC(int) 980_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 981 const char *str, /* ASCII-encoded byte string */ 982 Py_ssize_t len /* number of bytes, or -1 if unknown */ 983 ); 984 985/* Append a latin1-encoded byte string. 986 Return 0 on success, raise an exception and return -1 on error. */ 987PyAPI_FUNC(int) 988_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 989 const char *str, /* latin1-encoded byte string */ 990 Py_ssize_t len /* length in bytes */ 991 ); 992 993/* Get the value of the writer as a Unicode string. Clear the 994 buffer of the writer. Raise an exception and return NULL 995 on error. */ 996PyAPI_FUNC(PyObject *) 997_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer); 998 999/* Deallocate memory of a writer (clear its internal buffer). 
*/ 1000PyAPI_FUNC(void) 1001_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer); 1002#endif 1003 1004#ifndef Py_LIMITED_API 1005/* Format the object based on the format_spec, as defined in PEP 3101 1006 (Advanced String Formatting). */ 1007PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter( 1008 _PyUnicodeWriter *writer, 1009 PyObject *obj, 1010 PyObject *format_spec, 1011 Py_ssize_t start, 1012 Py_ssize_t end); 1013#endif 1014 1015PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 1016PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 1017PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 1018 const char *u /* UTF-8 encoded string */ 1019 ); 1020#ifndef Py_LIMITED_API 1021PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 1022#endif 1023 1024/* Use only if you know it's a string */ 1025#define PyUnicode_CHECK_INTERNED(op) \ 1026 (((PyASCIIObject *)(op))->state.interned) 1027 1028/* --- wchar_t support for platforms which support it --------------------- */ 1029 1030#ifdef HAVE_WCHAR_H 1031 1032/* Create a Unicode Object from the wchar_t buffer w of the given 1033 size. 1034 1035 The buffer is copied into the new object. */ 1036 1037PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 1038 const wchar_t *w, /* wchar_t buffer */ 1039 Py_ssize_t size /* size of buffer */ 1040 ); 1041 1042/* Copies the Unicode Object contents into the wchar_t buffer w. At 1043 most size wchar_t characters are copied. 1044 1045 Note that the resulting wchar_t string may or may not be 1046 0-terminated. It is the responsibility of the caller to make sure 1047 that the wchar_t string is 0-terminated in case this is required by 1048 the application. 1049 1050 Returns the number of wchar_t characters copied (excluding a 1051 possibly trailing 0-termination character) or -1 in case of an 1052 error. 
*/ 1053 1054PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 1055 PyObject *unicode, /* Unicode object */ 1056 wchar_t *w, /* wchar_t buffer */ 1057 Py_ssize_t size /* size of buffer */ 1058 ); 1059 1060/* Convert the Unicode object to a wide character string. The output string 1061 always ends with a nul character. If size is not NULL, write the number of 1062 wide characters (excluding the null character) into *size. 1063 1064 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) 1065 on success. On error, returns NULL, *size is undefined and raises a 1066 MemoryError. */ 1067 1068PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 1069 PyObject *unicode, /* Unicode object */ 1070 Py_ssize_t *size /* number of characters of the result */ 1071 ); 1072 1073#ifndef Py_LIMITED_API 1074PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); 1075#endif 1076 1077#endif 1078 1079/* --- Unicode ordinals --------------------------------------------------- */ 1080 1081/* Create a Unicode Object from the given Unicode code point ordinal. 1082 1083 The ordinal must be in range(0x110000). A ValueError is 1084 raised in case it is not. 1085 1086*/ 1087 1088PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 1089 1090/* --- Free-list management ----------------------------------------------- */ 1091 1092/* Clear the free list used by the Unicode implementation. 1093 1094 This can be used to release memory used for objects on the free 1095 list back to the Python memory allocator. 1096 1097*/ 1098 1099PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 1100 1101/* === Builtin Codecs ===================================================== 1102 1103 Many of these APIs take two arguments encoding and errors. These 1104 parameters encoding and errors have the same semantics as the ones 1105 of the builtin str() API. 1106 1107 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 
1108 1109 Error handling is set by errors which may also be set to NULL 1110 meaning to use the default handling defined for the codec. Default 1111 error handling for all builtin codecs is "strict" (ValueErrors are 1112 raised). 1113 1114 The codecs all use a similar interface. Only deviation from the 1115 generic ones are documented. 1116 1117*/ 1118 1119/* --- Manage the default encoding ---------------------------------------- */ 1120 1121/* Returns a pointer to the default encoding (UTF-8) of the 1122 Unicode object unicode and the size of the encoded representation 1123 in bytes stored in *size. 1124 1125 In case of an error, no *size is set. 1126 1127 This function caches the UTF-8 encoded string in the unicodeobject 1128 and subsequent calls will return the same string. The memory is released 1129 when the unicodeobject is deallocated. 1130 1131 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to 1132 support the previous internal function with the same behaviour. 1133 1134 *** This API is for interpreter INTERNAL USE ONLY and will likely 1135 *** be removed or changed in the future. 1136 1137 *** If you need to access the Unicode object as UTF-8 bytes string, 1138 *** please use PyUnicode_AsUTF8String() instead. 1139*/ 1140 1141#ifndef Py_LIMITED_API 1142PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize( 1143 PyObject *unicode, 1144 Py_ssize_t *size); 1145#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize 1146#endif 1147 1148/* Returns a pointer to the default encoding (UTF-8) of the 1149 Unicode object unicode. 1150 1151 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation 1152 in the unicodeobject. 1153 1154 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to 1155 support the previous internal function with the same behaviour. 1156 1157 Use of this API is DEPRECATED since no size information can be 1158 extracted from the returned data. 
1159 1160 *** This API is for interpreter INTERNAL USE ONLY and will likely 1161 *** be removed or changed for Python 3.1. 1162 1163 *** If you need to access the Unicode object as UTF-8 bytes string, 1164 *** please use PyUnicode_AsUTF8String() instead. 1165 1166*/ 1167 1168#ifndef Py_LIMITED_API 1169PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode); 1170#define _PyUnicode_AsString PyUnicode_AsUTF8 1171#endif 1172 1173/* Returns "utf-8". */ 1174 1175PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 1176 1177/* --- Generic Codecs ----------------------------------------------------- */ 1178 1179/* Create a Unicode object by decoding the encoded string s of the 1180 given size. */ 1181 1182PyAPI_FUNC(PyObject*) PyUnicode_Decode( 1183 const char *s, /* encoded string */ 1184 Py_ssize_t size, /* size of buffer */ 1185 const char *encoding, /* encoding */ 1186 const char *errors /* error handling */ 1187 ); 1188 1189/* Decode a Unicode object unicode and return the result as Python 1190 object. */ 1191 1192PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 1193 PyObject *unicode, /* Unicode object */ 1194 const char *encoding, /* encoding */ 1195 const char *errors /* error handling */ 1196 ); 1197 1198/* Decode a Unicode object unicode and return the result as Unicode 1199 object. */ 1200 1201PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 1202 PyObject *unicode, /* Unicode object */ 1203 const char *encoding, /* encoding */ 1204 const char *errors /* error handling */ 1205 ); 1206 1207/* Encodes a Py_UNICODE buffer of the given size and returns a 1208 Python string object. */ 1209 1210#ifndef Py_LIMITED_API 1211PyAPI_FUNC(PyObject*) PyUnicode_Encode( 1212 const Py_UNICODE *s, /* Unicode char buffer */ 1213 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 1214 const char *encoding, /* encoding */ 1215 const char *errors /* error handling */ 1216 ); 1217#endif 1218 1219/* Encodes a Unicode object and returns the result as Python 1220 object. 
*/ 1221 1222PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 1223 PyObject *unicode, /* Unicode object */ 1224 const char *encoding, /* encoding */ 1225 const char *errors /* error handling */ 1226 ); 1227 1228/* Encodes a Unicode object and returns the result as Python string 1229 object. */ 1230 1231PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 1232 PyObject *unicode, /* Unicode object */ 1233 const char *encoding, /* encoding */ 1234 const char *errors /* error handling */ 1235 ); 1236 1237/* Encodes a Unicode object and returns the result as Unicode 1238 object. */ 1239 1240PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 1241 PyObject *unicode, /* Unicode object */ 1242 const char *encoding, /* encoding */ 1243 const char *errors /* error handling */ 1244 ); 1245 1246/* Build an encoding map. */ 1247 1248PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 1249 PyObject* string /* 256 character map */ 1250 ); 1251 1252/* --- UTF-7 Codecs ------------------------------------------------------- */ 1253 1254PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 1255 const char *string, /* UTF-7 encoded string */ 1256 Py_ssize_t length, /* size of string */ 1257 const char *errors /* error handling */ 1258 ); 1259 1260PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 1261 const char *string, /* UTF-7 encoded string */ 1262 Py_ssize_t length, /* size of string */ 1263 const char *errors, /* error handling */ 1264 Py_ssize_t *consumed /* bytes consumed */ 1265 ); 1266 1267#ifndef Py_LIMITED_API 1268PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 1269 const Py_UNICODE *data, /* Unicode char buffer */ 1270 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1271 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1272 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1273 const char *errors /* error handling */ 1274 ); 1275PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7( 1276 PyObject *unicode, /* Unicode object */ 1277 int base64SetO, /* 
Encode RFC2152 Set O characters in base64 */ 1278 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1279 const char *errors /* error handling */ 1280 ); 1281#endif 1282 1283/* --- UTF-8 Codecs ------------------------------------------------------- */ 1284 1285PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 1286 const char *string, /* UTF-8 encoded string */ 1287 Py_ssize_t length, /* size of string */ 1288 const char *errors /* error handling */ 1289 ); 1290 1291PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 1292 const char *string, /* UTF-8 encoded string */ 1293 Py_ssize_t length, /* size of string */ 1294 const char *errors, /* error handling */ 1295 Py_ssize_t *consumed /* bytes consumed */ 1296 ); 1297 1298PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 1299 PyObject *unicode /* Unicode object */ 1300 ); 1301 1302#ifndef Py_LIMITED_API 1303PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( 1304 PyObject *unicode, 1305 const char *errors); 1306 1307PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 1308 const Py_UNICODE *data, /* Unicode char buffer */ 1309 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1310 const char *errors /* error handling */ 1311 ); 1312#endif 1313 1314/* --- UTF-32 Codecs ------------------------------------------------------ */ 1315 1316/* Decodes length bytes from a UTF-32 encoded buffer string and returns 1317 the corresponding Unicode object. 1318 1319 errors (if non-NULL) defines the error handling. It defaults 1320 to "strict". 1321 1322 If byteorder is non-NULL, the decoder starts decoding using the 1323 given byte order: 1324 1325 *byteorder == -1: little endian 1326 *byteorder == 0: native order 1327 *byteorder == 1: big endian 1328 1329 In native mode, the first four bytes of the stream are checked for a 1330 BOM mark. If found, the BOM mark is analysed, the byte order 1331 adjusted and the BOM skipped. In the other modes, no BOM mark 1332 interpretation is done. 
After completion, *byteorder is set to the 1333 current byte order at the end of input data. 1334 1335 If byteorder is NULL, the codec starts in native order mode. 1336 1337*/ 1338 1339PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 1340 const char *string, /* UTF-32 encoded string */ 1341 Py_ssize_t length, /* size of string */ 1342 const char *errors, /* error handling */ 1343 int *byteorder /* pointer to byteorder to use 1344 0=native;-1=LE,1=BE; updated on 1345 exit */ 1346 ); 1347 1348PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 1349 const char *string, /* UTF-32 encoded string */ 1350 Py_ssize_t length, /* size of string */ 1351 const char *errors, /* error handling */ 1352 int *byteorder, /* pointer to byteorder to use 1353 0=native;-1=LE,1=BE; updated on 1354 exit */ 1355 Py_ssize_t *consumed /* bytes consumed */ 1356 ); 1357 1358/* Returns a Python string using the UTF-32 encoding in native byte 1359 order. The string always starts with a BOM mark. */ 1360 1361PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 1362 PyObject *unicode /* Unicode object */ 1363 ); 1364 1365/* Returns a Python string object holding the UTF-32 encoded value of 1366 the Unicode data. 1367 1368 If byteorder is not 0, output is written according to the following 1369 byte order: 1370 1371 byteorder == -1: little endian 1372 byteorder == 0: native byte order (writes a BOM mark) 1373 byteorder == 1: big endian 1374 1375 If byteorder is 0, the output string will always start with the 1376 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1377 prepended. 
1378 1379*/ 1380 1381#ifndef Py_LIMITED_API 1382PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 1383 const Py_UNICODE *data, /* Unicode char buffer */ 1384 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1385 const char *errors, /* error handling */ 1386 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1387 ); 1388PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32( 1389 PyObject *object, /* Unicode object */ 1390 const char *errors, /* error handling */ 1391 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1392 ); 1393#endif 1394 1395/* --- UTF-16 Codecs ------------------------------------------------------ */ 1396 1397/* Decodes length bytes from a UTF-16 encoded buffer string and returns 1398 the corresponding Unicode object. 1399 1400 errors (if non-NULL) defines the error handling. It defaults 1401 to "strict". 1402 1403 If byteorder is non-NULL, the decoder starts decoding using the 1404 given byte order: 1405 1406 *byteorder == -1: little endian 1407 *byteorder == 0: native order 1408 *byteorder == 1: big endian 1409 1410 In native mode, the first two bytes of the stream are checked for a 1411 BOM mark. If found, the BOM mark is analysed, the byte order 1412 adjusted and the BOM skipped. In the other modes, no BOM mark 1413 interpretation is done. After completion, *byteorder is set to the 1414 current byte order at the end of input data. 1415 1416 If byteorder is NULL, the codec starts in native order mode. 
1417 1418*/ 1419 1420PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 1421 const char *string, /* UTF-16 encoded string */ 1422 Py_ssize_t length, /* size of string */ 1423 const char *errors, /* error handling */ 1424 int *byteorder /* pointer to byteorder to use 1425 0=native;-1=LE,1=BE; updated on 1426 exit */ 1427 ); 1428 1429PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 1430 const char *string, /* UTF-16 encoded string */ 1431 Py_ssize_t length, /* size of string */ 1432 const char *errors, /* error handling */ 1433 int *byteorder, /* pointer to byteorder to use 1434 0=native;-1=LE,1=BE; updated on 1435 exit */ 1436 Py_ssize_t *consumed /* bytes consumed */ 1437 ); 1438 1439/* Returns a Python string using the UTF-16 encoding in native byte 1440 order. The string always starts with a BOM mark. */ 1441 1442PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 1443 PyObject *unicode /* Unicode object */ 1444 ); 1445 1446/* Returns a Python string object holding the UTF-16 encoded value of 1447 the Unicode data. 1448 1449 If byteorder is not 0, output is written according to the following 1450 byte order: 1451 1452 byteorder == -1: little endian 1453 byteorder == 0: native byte order (writes a BOM mark) 1454 byteorder == 1: big endian 1455 1456 If byteorder is 0, the output string will always start with the 1457 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1458 prepended. 1459 1460 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 1461 UCS-2. This trick makes it possible to add full UTF-16 capabilities 1462 at a later point without compromising the APIs. 
1463 1464*/ 1465 1466#ifndef Py_LIMITED_API 1467PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 1468 const Py_UNICODE *data, /* Unicode char buffer */ 1469 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1470 const char *errors, /* error handling */ 1471 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1472 ); 1473PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16( 1474 PyObject* unicode, /* Unicode object */ 1475 const char *errors, /* error handling */ 1476 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1477 ); 1478#endif 1479 1480/* --- Unicode-Escape Codecs ---------------------------------------------- */ 1481 1482PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 1483 const char *string, /* Unicode-Escape encoded string */ 1484 Py_ssize_t length, /* size of string */ 1485 const char *errors /* error handling */ 1486 ); 1487 1488PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 1489 PyObject *unicode /* Unicode object */ 1490 ); 1491 1492#ifndef Py_LIMITED_API 1493PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 1494 const Py_UNICODE *data, /* Unicode char buffer */ 1495 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1496 ); 1497#endif 1498 1499/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 1500 1501PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 1502 const char *string, /* Raw-Unicode-Escape encoded string */ 1503 Py_ssize_t length, /* size of string */ 1504 const char *errors /* error handling */ 1505 ); 1506 1507PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 1508 PyObject *unicode /* Unicode object */ 1509 ); 1510 1511#ifndef Py_LIMITED_API 1512PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 1513 const Py_UNICODE *data, /* Unicode char buffer */ 1514 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1515 ); 1516#endif 1517 1518/* --- Unicode Internal Codec --------------------------------------------- 1519 1520 Only for internal use 
in _codecsmodule.c */ 1521 1522#ifndef Py_LIMITED_API 1523PyObject *_PyUnicode_DecodeUnicodeInternal( 1524 const char *string, 1525 Py_ssize_t length, 1526 const char *errors 1527 ); 1528#endif 1529 1530/* --- Latin-1 Codecs ----------------------------------------------------- 1531 1532 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1533 1534*/ 1535 1536PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1537 const char *string, /* Latin-1 encoded string */ 1538 Py_ssize_t length, /* size of string */ 1539 const char *errors /* error handling */ 1540 ); 1541 1542PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1543 PyObject *unicode /* Unicode object */ 1544 ); 1545 1546#ifndef Py_LIMITED_API 1547PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( 1548 PyObject* unicode, 1549 const char* errors); 1550 1551PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1552 const Py_UNICODE *data, /* Unicode char buffer */ 1553 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1554 const char *errors /* error handling */ 1555 ); 1556#endif 1557 1558/* --- ASCII Codecs ------------------------------------------------------- 1559 1560 Only 7-bit ASCII data is accepted. All other codes generate errors. 
1561 1562*/ 1563 1564PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1565 const char *string, /* ASCII encoded string */ 1566 Py_ssize_t length, /* size of string */ 1567 const char *errors /* error handling */ 1568 ); 1569 1570PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1571 PyObject *unicode /* Unicode object */ 1572 ); 1573 1574#ifndef Py_LIMITED_API 1575PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( 1576 PyObject* unicode, 1577 const char* errors); 1578 1579PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1580 const Py_UNICODE *data, /* Unicode char buffer */ 1581 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1582 const char *errors /* error handling */ 1583 ); 1584#endif 1585 1586/* --- Character Map Codecs ----------------------------------------------- 1587 1588 This codec uses mappings to encode and decode characters. 1589 1590 Decoding mappings must map single string characters to single 1591 Unicode characters, integers (which are then interpreted as Unicode 1592 ordinals) or None (meaning "undefined mapping" and causing an 1593 error). 1594 1595 Encoding mappings must map single Unicode characters to single 1596 string characters, integers (which are then interpreted as Latin-1 1597 ordinals) or None (meaning "undefined mapping" and causing an 1598 error). 1599 1600 If a character lookup fails with a LookupError, the character is 1601 copied as-is meaning that its ordinal value will be interpreted as 1602 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1603 to contain those mappings which map characters to different code 1604 points. 
1605 1606*/ 1607 1608PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1609 const char *string, /* Encoded string */ 1610 Py_ssize_t length, /* size of string */ 1611 PyObject *mapping, /* character mapping 1612 (char ordinal -> unicode ordinal) */ 1613 const char *errors /* error handling */ 1614 ); 1615 1616PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1617 PyObject *unicode, /* Unicode object */ 1618 PyObject *mapping /* character mapping 1619 (unicode ordinal -> char ordinal) */ 1620 ); 1621 1622#ifndef Py_LIMITED_API 1623PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1624 const Py_UNICODE *data, /* Unicode char buffer */ 1625 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1626 PyObject *mapping, /* character mapping 1627 (unicode ordinal -> char ordinal) */ 1628 const char *errors /* error handling */ 1629 ); 1630PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap( 1631 PyObject *unicode, /* Unicode object */ 1632 PyObject *mapping, /* character mapping 1633 (unicode ordinal -> char ordinal) */ 1634 const char *errors /* error handling */ 1635 ); 1636#endif 1637 1638/* Translate a Py_UNICODE buffer of the given length by applying a 1639 character mapping table to it and return the resulting Unicode 1640 object. 1641 1642 The mapping table must map Unicode ordinal integers to Unicode 1643 ordinal integers or None (causing deletion of the character). 1644 1645 Mapping tables may be dictionaries or sequences. Unmapped character 1646 ordinals (ones which cause a LookupError) are left untouched and 1647 are copied as-is. 
1648 1649*/ 1650 1651#ifndef Py_LIMITED_API 1652PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1653 const Py_UNICODE *data, /* Unicode char buffer */ 1654 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1655 PyObject *table, /* Translate table */ 1656 const char *errors /* error handling */ 1657 ); 1658#endif 1659 1660#ifdef HAVE_MBCS 1661 1662/* --- MBCS codecs for Windows -------------------------------------------- */ 1663 1664PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1665 const char *string, /* MBCS encoded string */ 1666 Py_ssize_t length, /* size of string */ 1667 const char *errors /* error handling */ 1668 ); 1669 1670PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1671 const char *string, /* MBCS encoded string */ 1672 Py_ssize_t length, /* size of string */ 1673 const char *errors, /* error handling */ 1674 Py_ssize_t *consumed /* bytes consumed */ 1675 ); 1676 1677PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 1678 int code_page, /* code page number */ 1679 const char *string, /* encoded string */ 1680 Py_ssize_t length, /* size of string */ 1681 const char *errors, /* error handling */ 1682 Py_ssize_t *consumed /* bytes consumed */ 1683 ); 1684 1685PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1686 PyObject *unicode /* Unicode object */ 1687 ); 1688 1689#ifndef Py_LIMITED_API 1690PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1691 const Py_UNICODE *data, /* Unicode char buffer */ 1692 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1693 const char *errors /* error handling */ 1694 ); 1695#endif 1696 1697PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 1698 int code_page, /* code page number */ 1699 PyObject *unicode, /* Unicode object */ 1700 const char *errors /* error handling */ 1701 ); 1702 1703#endif /* HAVE_MBCS */ 1704 1705/* --- Decimal Encoder ---------------------------------------------------- */ 1706 1707/* Takes a Unicode string holding a decimal value and writes it into 1708 an output buffer using 
standard ASCII digit codes. 1709 1710 The output buffer has to provide at least length+1 bytes of storage 1711 area. The output string is 0-terminated. 1712 1713 The encoder converts whitespace to ' ', decimal characters to their 1714 corresponding ASCII digit and all other Latin-1 characters except 1715 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 1716 are treated as errors. This includes embedded NULL bytes. 1717 1718 Error handling is defined by the errors argument: 1719 1720 NULL or "strict": raise a ValueError 1721 "ignore": ignore the wrong characters (these are not copied to the 1722 output buffer) 1723 "replace": replaces illegal characters with '?' 1724 1725 Returns 0 on success, -1 on failure. 1726 1727*/ 1728 1729#ifndef Py_LIMITED_API 1730PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1731 Py_UNICODE *s, /* Unicode buffer */ 1732 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1733 char *output, /* Output buffer; must have size >= length+1 (room for the terminating 0) */ 1734 const char *errors /* error handling */ 1735 ); 1736#endif 1737 1738/* Transforms code points that have decimal digit property to the 1739 corresponding ASCII digit code points. 1740 1741 Returns a new Unicode string on success, NULL on failure. 1742*/ 1743 1744#ifndef Py_LIMITED_API 1745PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( 1746 Py_UNICODE *s, /* Unicode buffer */ 1747 Py_ssize_t length /* Number of Py_UNICODE chars to transform */ 1748 ); 1749#endif 1750 1751/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject 1752 as argument instead of a raw buffer and length. This function additionally 1753 transforms spaces to ASCII because this is what the callers in longobject, 1754 floatobject, and complexobject did anyways. 
*/ 1755 1756#ifndef Py_LIMITED_API 1757PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( 1758 PyObject *unicode /* Unicode object */ 1759 ); 1760#endif 1761 1762/* --- Locale encoding --------------------------------------------------- */ 1763 1764/* Decode a string from the current locale encoding. The decoder is strict if 1765 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 1766 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 1767 be decoded as a surrogate character and *surrogateescape* is not equal to 1768 zero, the byte sequence is escaped using the 'surrogateescape' error handler 1769 instead of being decoded. *str* must end with a null character but cannot 1770 contain embedded null characters. NOTE(review): the prototype takes an error handler name (*errors*), not a *surrogateescape* flag; this description appears to predate that signature -- confirm which values of *errors* select strict and surrogateescape handling. */ 1771 1772PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 1773 const char *str, 1774 Py_ssize_t len, 1775 const char *errors); 1776 1777/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 1778 length using strlen(). */ 1779 1780PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 1781 const char *str, 1782 const char *errors); 1783 1784/* Encode a Unicode object to the current locale encoding. The encoder is 1785 strict if *surrogateescape* is equal to zero, otherwise the 1786 "surrogateescape" error handler is used. Return a bytes object. The string 1787 cannot contain embedded null characters. NOTE(review): the prototype takes an error handler name (*errors*), not a *surrogateescape* flag -- confirm the accepted values. */ 1788 1789PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 1790 PyObject *unicode, 1791 const char *errors 1792 ); 1793 1794/* --- File system encoding ---------------------------------------------- */ 1795 1796/* ParseTuple converter: encode str objects to bytes using 1797 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 1798 1799PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1800 1801/* ParseTuple converter: decode bytes objects to unicode using 1802 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. 
*/ 1803 1804PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1805 1806/* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1807 and the "surrogateescape" error handler. 1808 1809 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1810 encoding. 1811 1812 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1813*/ 1814 1815PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1816 const char *s /* encoded string */ 1817 ); 1818 1819/* Decode a string using Py_FileSystemDefaultEncoding 1820 and the "surrogateescape" error handler. 1821 1822 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1823 encoding. 1824*/ 1825 1826PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1827 const char *s, /* encoded string */ 1828 Py_ssize_t size /* size */ 1829 ); 1830 1831/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1832 "surrogateescape" error handler, and return bytes. 1833 1834 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1835 encoding. 1836*/ 1837 1838PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1839 PyObject *unicode 1840 ); 1841 1842/* --- Methods & Slots ---------------------------------------------------- 1843 1844 These are capable of handling Unicode objects and strings on input 1845 (we refer to them as strings in the descriptions) and return 1846 Unicode objects or integers as appropriate. */ 1847 1848/* Concat two strings giving a new Unicode string. 
*/ 1849 1850PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1851 PyObject *left, /* Left string */ 1852 PyObject *right /* Right string */ 1853 ); 1854 1855/* Concat two strings and put the result in *pleft 1856 (sets *pleft to NULL on error) */ 1857 1858PyAPI_FUNC(void) PyUnicode_Append( 1859 PyObject **pleft, /* Pointer to left string */ 1860 PyObject *right /* Right string */ 1861 ); 1862 1863/* Concat two strings, put the result in *pleft and drop the right object 1864 (sets *pleft to NULL on error) */ 1865 1866PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1867 PyObject **pleft, /* Pointer to left string */ 1868 PyObject *right /* Right string */ 1869 ); 1870 1871/* Split a string giving a list of Unicode strings. 1872 1873 If sep is NULL, splitting will be done at all whitespace 1874 substrings. Otherwise, splits occur at the given separator. 1875 1876 At most maxsplit splits will be done. If negative, no limit is set. 1877 1878 Separators are not included in the resulting list. 1879 1880*/ 1881 1882PyAPI_FUNC(PyObject*) PyUnicode_Split( 1883 PyObject *s, /* String to split */ 1884 PyObject *sep, /* String separator */ 1885 Py_ssize_t maxsplit /* Maxsplit count */ 1886 ); 1887 1888/* Dito, but split at line breaks. 1889 1890 CRLF is considered to be one line break. Line breaks are not 1891 included in the resulting list. */ 1892 1893PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1894 PyObject *s, /* String to split */ 1895 int keepends /* If true, line end markers are included */ 1896 ); 1897 1898/* Partition a string using a given separator. */ 1899 1900PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1901 PyObject *s, /* String to partition */ 1902 PyObject *sep /* String separator */ 1903 ); 1904 1905/* Partition a string using a given separator, searching from the end of the 1906 string. 
*/ 1907 1908PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1909 PyObject *s, /* String to partition */ 1910 PyObject *sep /* String separator */ 1911 ); 1912 1913/* Split a string giving a list of Unicode strings. 1914 1915 If sep is NULL, splitting will be done at all whitespace 1916 substrings. Otherwise, splits occur at the given separator. 1917 1918 At most maxsplit splits will be done. But unlike PyUnicode_Split 1919 PyUnicode_RSplit splits from the end of the string. If negative, 1920 no limit is set. 1921 1922 Separators are not included in the resulting list. 1923 1924*/ 1925 1926PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1927 PyObject *s, /* String to split */ 1928 PyObject *sep, /* String separator */ 1929 Py_ssize_t maxsplit /* Maxsplit count */ 1930 ); 1931 1932/* Translate a string by applying a character mapping table to it and 1933 return the resulting Unicode object. 1934 1935 The mapping table must map Unicode ordinal integers to Unicode 1936 ordinal integers or None (causing deletion of the character). 1937 1938 Mapping tables may be dictionaries or sequences. Unmapped character 1939 ordinals (ones which cause a LookupError) are left untouched and 1940 are copied as-is. 1941 1942*/ 1943 1944PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1945 PyObject *str, /* String */ 1946 PyObject *table, /* Translate table */ 1947 const char *errors /* error handling */ 1948 ); 1949 1950/* Join a sequence of strings using the given separator and return 1951 the resulting Unicode string. */ 1952 1953PyAPI_FUNC(PyObject*) PyUnicode_Join( 1954 PyObject *separator, /* Separator string */ 1955 PyObject *seq /* Sequence object */ 1956 ); 1957 1958/* Return 1 if substr matches str[start:end] at the given tail end, 0 1959 otherwise. 
*/ 1960 1961PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1962 PyObject *str, /* String */ 1963 PyObject *substr, /* Prefix or Suffix string */ 1964 Py_ssize_t start, /* Start index */ 1965 Py_ssize_t end, /* Stop index */ 1966 int direction /* Tail end: -1 prefix, +1 suffix */ 1967 ); 1968 1969/* Return the first position of substr in str[start:end] using the 1970 given search direction or -1 if not found. -2 is returned in case 1971 an error occurred and an exception is set. */ 1972 1973PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1974 PyObject *str, /* String */ 1975 PyObject *substr, /* Substring to find */ 1976 Py_ssize_t start, /* Start index */ 1977 Py_ssize_t end, /* Stop index */ 1978 int direction /* Find direction: +1 forward, -1 backward */ 1979 ); 1980 1981/* Like PyUnicode_Find, but search for single character only. */ 1982PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 1983 PyObject *str, 1984 Py_UCS4 ch, 1985 Py_ssize_t start, 1986 Py_ssize_t end, 1987 int direction 1988 ); 1989 1990/* Count the number of occurrences of substr in str[start:end]. */ 1991 1992PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1993 PyObject *str, /* String */ 1994 PyObject *substr, /* Substring to count */ 1995 Py_ssize_t start, /* Start index */ 1996 Py_ssize_t end /* Stop index */ 1997 ); 1998 1999/* Replace at most maxcount occurrences of substr in str with replstr 2000 and return the resulting Unicode object. */ 2001 2002PyAPI_FUNC(PyObject *) PyUnicode_Replace( 2003 PyObject *str, /* String */ 2004 PyObject *substr, /* Substring to find */ 2005 PyObject *replstr, /* Substring to replace */ 2006 Py_ssize_t maxcount /* Max. number of replacements to apply; 2007 -1 = all */ 2008 ); 2009 2010/* Compare two strings and return -1, 0, 1 for less than, equal, 2011 greater than resp. 2012 Raise an exception and return -1 on error. 
*/ 2013 2014PyAPI_FUNC(int) PyUnicode_Compare( 2015 PyObject *left, /* Left string */ 2016 PyObject *right /* Right string */ 2017 ); 2018 2019#ifndef Py_LIMITED_API 2020PyAPI_FUNC(int) _PyUnicode_CompareWithId( 2021 PyObject *left, /* Left string */ 2022 _Py_Identifier *right /* Right identifier */ 2023 ); 2024#endif 2025 2026PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 2027 PyObject *left, 2028 const char *right /* ASCII-encoded string */ 2029 ); 2030 2031/* Rich compare two strings and return one of the following: 2032 2033 - NULL in case an exception was raised 2034 - Py_True or Py_False for successful comparisons 2035 - Py_NotImplemented in case the type combination is unknown 2036 2037 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 2038 case the conversion of the arguments to Unicode fails with a 2039 UnicodeDecodeError. 2040 2041 Possible values for op: 2042 2043 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 2044 2045*/ 2046 2047PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 2048 PyObject *left, /* Left string */ 2049 PyObject *right, /* Right string */ 2050 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 2051 ); 2052 2053/* Apply an argument tuple or dictionary to a format string and return 2054 the resulting Unicode string. */ 2055 2056PyAPI_FUNC(PyObject *) PyUnicode_Format( 2057 PyObject *format, /* Format string */ 2058 PyObject *args /* Argument tuple or dictionary */ 2059 ); 2060 2061/* Checks whether element is contained in container and return 1/0 2062 accordingly. 2063 2064 element has to coerce to a one element Unicode string. -1 is 2065 returned in case of an error. */ 2066 2067PyAPI_FUNC(int) PyUnicode_Contains( 2068 PyObject *container, /* Container string */ 2069 PyObject *element /* Element string */ 2070 ); 2071 2072/* Checks whether argument is a valid identifier. 
*/ 2073 2074PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 2075 2076#ifndef Py_LIMITED_API 2077/* Externally visible for str.strip(unicode) */ 2078PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 2079 PyObject *self, 2080 int striptype, 2081 PyObject *sepobj 2082 ); 2083#endif 2084 2085/* Using explicit passed-in values, insert the thousands grouping 2086 into the string pointed to by buffer. For the argument descriptions, 2087 see Objects/stringlib/localeutil.h */ 2088#ifndef Py_LIMITED_API 2089PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( 2090 PyObject *unicode, 2091 Py_ssize_t index, 2092 Py_ssize_t n_buffer, 2093 void *digits, 2094 Py_ssize_t n_digits, 2095 Py_ssize_t min_width, 2096 const char *grouping, 2097 PyObject *thousands_sep, 2098 Py_UCS4 *maxchar); 2099#endif 2100/* === Characters Type APIs =============================================== */ 2101 2102/* Helper array used by Py_UNICODE_ISSPACE(). */ 2103 2104#ifndef Py_LIMITED_API 2105PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 2106 2107/* These should not be used directly. Use the Py_UNICODE_IS* and 2108 Py_UNICODE_TO* macros instead. 2109 2110 These APIs are implemented in Objects/unicodectype.c. 
2111 2112*/ 2113 2114PyAPI_FUNC(int) _PyUnicode_IsLowercase( 2115 Py_UCS4 ch /* Unicode character */ 2116 ); 2117 2118PyAPI_FUNC(int) _PyUnicode_IsUppercase( 2119 Py_UCS4 ch /* Unicode character */ 2120 ); 2121 2122PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 2123 Py_UCS4 ch /* Unicode character */ 2124 ); 2125 2126PyAPI_FUNC(int) _PyUnicode_IsXidStart( 2127 Py_UCS4 ch /* Unicode character */ 2128 ); 2129 2130PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 2131 Py_UCS4 ch /* Unicode character */ 2132 ); 2133 2134PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 2135 const Py_UCS4 ch /* Unicode character */ 2136 ); 2137 2138PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 2139 const Py_UCS4 ch /* Unicode character */ 2140 ); 2141 2142PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 2143 Py_UCS4 ch /* Unicode character */ 2144 ); 2145 2146PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 2147 Py_UCS4 ch /* Unicode character */ 2148 ); 2149 2150PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 2151 Py_UCS4 ch /* Unicode character */ 2152 ); 2153 2154PyAPI_FUNC(int) _PyUnicode_ToLowerFull( 2155 Py_UCS4 ch, /* Unicode character */ 2156 Py_UCS4 *res 2157 ); 2158 2159PyAPI_FUNC(int) _PyUnicode_ToTitleFull( 2160 Py_UCS4 ch, /* Unicode character */ 2161 Py_UCS4 *res 2162 ); 2163 2164PyAPI_FUNC(int) _PyUnicode_ToUpperFull( 2165 Py_UCS4 ch, /* Unicode character */ 2166 Py_UCS4 *res 2167 ); 2168 2169PyAPI_FUNC(int) _PyUnicode_ToFoldedFull( 2170 Py_UCS4 ch, /* Unicode character */ 2171 Py_UCS4 *res 2172 ); 2173 2174PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( 2175 Py_UCS4 ch /* Unicode character */ 2176 ); 2177 2178PyAPI_FUNC(int) _PyUnicode_IsCased( 2179 Py_UCS4 ch /* Unicode character */ 2180 ); 2181 2182PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 2183 Py_UCS4 ch /* Unicode character */ 2184 ); 2185 2186PyAPI_FUNC(int) _PyUnicode_ToDigit( 2187 Py_UCS4 ch /* Unicode character */ 2188 ); 2189 2190PyAPI_FUNC(double) _PyUnicode_ToNumeric( 2191 Py_UCS4 ch /* Unicode character */ 2192 ); 2193 2194PyAPI_FUNC(int) 
_PyUnicode_IsDecimalDigit( 2195 Py_UCS4 ch /* Unicode character */ 2196 ); 2197 2198PyAPI_FUNC(int) _PyUnicode_IsDigit( 2199 Py_UCS4 ch /* Unicode character */ 2200 ); 2201 2202PyAPI_FUNC(int) _PyUnicode_IsNumeric( 2203 Py_UCS4 ch /* Unicode character */ 2204 ); 2205 2206PyAPI_FUNC(int) _PyUnicode_IsPrintable( 2207 Py_UCS4 ch /* Unicode character */ 2208 ); 2209 2210PyAPI_FUNC(int) _PyUnicode_IsAlpha( 2211 Py_UCS4 ch /* Unicode character */ 2212 ); 2213 2214PyAPI_FUNC(size_t) Py_UNICODE_strlen( 2215 const Py_UNICODE *u 2216 ); 2217 2218PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 2219 Py_UNICODE *s1, 2220 const Py_UNICODE *s2); 2221 2222PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat( 2223 Py_UNICODE *s1, const Py_UNICODE *s2); 2224 2225PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 2226 Py_UNICODE *s1, 2227 const Py_UNICODE *s2, 2228 size_t n); 2229 2230PyAPI_FUNC(int) Py_UNICODE_strcmp( 2231 const Py_UNICODE *s1, 2232 const Py_UNICODE *s2 2233 ); 2234 2235PyAPI_FUNC(int) Py_UNICODE_strncmp( 2236 const Py_UNICODE *s1, 2237 const Py_UNICODE *s2, 2238 size_t n 2239 ); 2240 2241PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 2242 const Py_UNICODE *s, 2243 Py_UNICODE c 2244 ); 2245 2246PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 2247 const Py_UNICODE *s, 2248 Py_UNICODE c 2249 ); 2250 2251PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int); 2252 2253/* Create a copy of a unicode string ending with a nul character. Return NULL 2254 and raise a MemoryError exception on memory allocation failure, otherwise 2255 return a new allocated buffer (use PyMem_Free() to free the buffer). 
*/ 2256 2257PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 2258 PyObject *unicode 2259 ); 2260#endif /* Py_LIMITED_API */ 2261 2262#if defined(Py_DEBUG) && !defined(Py_LIMITED_API) 2263PyAPI_FUNC(int) _PyUnicode_CheckConsistency( 2264 PyObject *op, 2265 int check_content); 2266#endif 2267 2268/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/ 2269PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); 2270/* Clear all static strings. */ 2271PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); 2272 2273/* Fast equality check when the inputs are known to be exact unicode types 2274 and where the hash values are equal (i.e. a very probable match) */ 2275PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); 2276 2277#ifdef __cplusplus 2278} 2279#endif 2280#endif /* !Py_UNICODEOBJECT_H */ 2281