/* unicodeobject.h revision a3b334da6dd0477e5bf144934d184bc0b3e3779b */
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal. (See 11http://www.egenix.com/files/python/unicode-proposal.txt). 12 13Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58#include <ctype.h> 59 60/* === Internal API ======================================================= */ 61 62/* --- Internal Unicode Format -------------------------------------------- */ 63 64/* Python 3.x requires unicode */ 65#define Py_USING_UNICODE 66 67#ifndef SIZEOF_WCHAR_T 68#error Must define SIZEOF_WCHAR_T 69#endif 70 71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Py_UNICODE was the native Unicode storage format (code unit) used by 87 Python and represents a single Unicode element in the Unicode type. 88 With PEP 393, Py_UNICODE is deprected and replaced with a 89 typedef to wchar_t. */ 90 91#ifndef Py_LIMITED_API 92#define PY_UNICODE_TYPE wchar_t 93typedef wchar_t Py_UNICODE; 94#endif 95 96/* If the compiler provides a wchar_t type we try to support it 97 through the interface functions PyUnicode_FromWideChar(), 98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). 
*/ 99 100#ifdef HAVE_USABLE_WCHAR_T 101# ifndef HAVE_WCHAR_H 102# define HAVE_WCHAR_H 103# endif 104#endif 105 106#if defined(MS_WINDOWS) 107# define HAVE_MBCS 108#endif 109 110#ifdef HAVE_WCHAR_H 111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 112# ifdef _HAVE_BSDI 113# include <time.h> 114# endif 115# include <wchar.h> 116#endif 117 118/* Py_UCS4 and Py_UCS2 are typdefs for the respecitve 119 unicode representations. */ 120#if SIZEOF_INT >= 4 121typedef unsigned int Py_UCS4; 122#elif SIZEOF_LONG >= 4 123typedef unsigned long Py_UCS4; 124#else 125#error "Could not find a proper typedef for Py_UCS4" 126#endif 127 128typedef unsigned short Py_UCS2; 129typedef unsigned char Py_UCS1; 130 131/* --- Internal Unicode Operations ---------------------------------------- */ 132 133/* Since splitting on whitespace is an important use case, and 134 whitespace in most situations is solely ASCII whitespace, we 135 optimize for the common case by using a quick look-up table 136 _Py_ascii_whitespace (see below) with an inlined check. 137 138 */ 139#ifndef Py_LIMITED_API 140#define Py_UNICODE_ISSPACE(ch) \ 141 ((ch) < 128U ? 
_Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 142 143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 147 148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 151 152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 156 157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 160 161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 162 163#define Py_UNICODE_ISALNUM(ch) \ 164 (Py_UNICODE_ISALPHA(ch) || \ 165 Py_UNICODE_ISDECIMAL(ch) || \ 166 Py_UNICODE_ISDIGIT(ch) || \ 167 Py_UNICODE_ISNUMERIC(ch)) 168 169#define Py_UNICODE_COPY(target, source, length) \ 170 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 171 172#define Py_UNICODE_FILL(target, value, length) \ 173 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 174 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 175 } while (0) 176 177/* macros to work with surrogates */ 178#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDFFF) 179#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) 180#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) 181/* Join two surrogate characters and return a single Py_UCS4 value. 
*/ 182#define Py_UNICODE_JOIN_SURROGATES(high, low) \ 183 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 184 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 185 186/* Check if substring matches at given offset. The offset must be 187 valid, and the substring must not be empty. */ 188 189#define Py_UNICODE_MATCH(string, offset, substring) \ 190 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ 191 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ 192 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) 193 194#endif /* Py_LIMITED_API */ 195 196#ifdef __cplusplus 197extern "C" { 198#endif 199 200/* --- Unicode Type ------------------------------------------------------- */ 201 202#ifndef Py_LIMITED_API 203 204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject 205 structure. state.ascii and state.compact are set, and the data 206 immediately follow the structure. utf8_length and wstr_length can be found 207 in the length field; the utf8 pointer is equal to the data pointer. 
*/ 208typedef struct { 209 /* Unicode strings can be in 4 states: 210 211 - compact ascii: 212 213 * structure = PyASCIIObject 214 * kind = PyUnicode_1BYTE_KIND 215 * compact = 1 216 * ascii = 1 217 * ready = 1 218 * utf8 = data 219 220 - compact: 221 222 * structure = PyCompactUnicodeObject 223 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 224 PyUnicode_4BYTE_KIND 225 * compact = 1 226 * ready = 1 227 * ascii = 0 228 229 - string created by the legacy API (not ready): 230 231 * structure = PyUnicodeObject 232 * kind = PyUnicode_WCHAR_KIND 233 * compact = 0 234 * ready = 0 235 * wstr is not NULL 236 * data.any is NULL 237 * utf8 is NULL 238 * interned = SSTATE_NOT_INTERNED 239 * ascii = 0 240 241 - string created by the legacy API, ready: 242 243 * structure = PyUnicodeObject structure 244 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or 245 PyUnicode_4BYTE_KIND 246 * compact = 0 247 * ready = 1 248 * data.any is not NULL 249 250 String created by the legacy API becomes ready when calling 251 PyUnicode_READY(). 252 253 See also _PyUnicode_CheckConsistency(). */ 254 PyObject_HEAD 255 Py_ssize_t length; /* Number of code points in the string */ 256 Py_hash_t hash; /* Hash value; -1 if not set */ 257 struct { 258 /* 259 SSTATE_NOT_INTERNED (0) 260 SSTATE_INTERNED_MORTAL (1) 261 SSTATE_INTERNED_IMMORTAL (2) 262 263 If interned != SSTATE_NOT_INTERNED, the two references from the 264 dictionary to this object are *not* counted in ob_refcnt. 265 */ 266 unsigned int interned:2; 267 /* Character size: 268 269 PyUnicode_WCHAR_KIND (0): wchar_t* 270 PyUnicode_1BYTE_KIND (1): Py_UCS1* 271 PyUnicode_2BYTE_KIND (2): Py_UCS2* 272 PyUnicode_4BYTE_KIND (3): Py_UCS4* 273 */ 274 unsigned int kind:2; 275 /* Compact is with respect to the allocation scheme. Compact unicode 276 objects only require one memory block while non-compact objects use 277 one block for the PyUnicodeObject struct and another for its data 278 buffer. 
*/ 279 unsigned int compact:1; 280 /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII 281 characters. If ascii is 1 and compact is 1, use the PyASCIIObject 282 structure. */ 283 unsigned int ascii:1; 284 /* The ready flag indicates whether the object layout is initialized 285 completely. This means that this is either a compact object, or 286 the data pointer is filled out. The bit is redundant, and helps 287 to minimize the test in PyUnicode_IS_READY(). */ 288 unsigned int ready:1; 289 } state; 290 wchar_t *wstr; /* wchar_t representation (null-terminated) */ 291} PyASCIIObject; 292 293/* Non-ASCII strings allocated through PyUnicode_New use the 294 PyCompactUnicodeOject structure. state.compact is set, and the data 295 immediately follow the structure. */ 296typedef struct { 297 PyASCIIObject _base; 298 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the 299 * terminating \0. */ 300 char *utf8; /* UTF-8 representation (null-terminated) */ 301 Py_ssize_t wstr_length; /* Number of code points in wstr, possible 302 * surrogates count as two code points. */ 303} PyCompactUnicodeObject; 304 305/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the 306 PyUnicodeObject structure. The actual string data is initially in the wstr 307 block, and copied into the data block using _PyUnicode_Ready. */ 308typedef struct { 309 PyCompactUnicodeObject _base; 310 union { 311 void *any; 312 Py_UCS1 *latin1; 313 Py_UCS2 *ucs2; 314 Py_UCS4 *ucs4; 315 } data; /* Canonical, smallest-form Unicode buffer */ 316} PyUnicodeObject; 317#endif 318 319PyAPI_DATA(PyTypeObject) PyUnicode_Type; 320PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 321 322#define PyUnicode_Check(op) \ 323 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 324#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 325 326/* Fast access macros */ 327#ifndef Py_LIMITED_API 328 329#define PyUnicode_WSTR_LENGTH(op) \ 330 (PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 331 ((PyASCIIObject*)op)->length : \ 332 ((PyCompactUnicodeObject*)op)->wstr_length) 333 334/* Returns the deprecated Py_UNICODE representation's size in code units 335 (this includes surrogate pairs as 2 units). 336 If the Py_UNICODE representation is not available, it will be computed 337 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ 338 339#define PyUnicode_GET_SIZE(op) \ 340 (assert(PyUnicode_Check(op)), \ 341 (((PyASCIIObject *)(op))->wstr) ? \ 342 PyUnicode_WSTR_LENGTH(op) : \ 343 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \ 344 PyUnicode_WSTR_LENGTH(op))) 345 346#define PyUnicode_GET_DATA_SIZE(op) \ 347 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) 348 349/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE 350 representation on demand. Using this macro is very inefficient now, 351 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or 352 use PyUnicode_WRITE() and PyUnicode_READ(). */ 353 354#define PyUnicode_AS_UNICODE(op) \ 355 (assert(PyUnicode_Check(op)), \ 356 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \ 357 PyUnicode_AsUnicode((PyObject *)(op))) 358 359#define PyUnicode_AS_DATA(op) \ 360 ((const char *)(PyUnicode_AS_UNICODE(op))) 361 362 363/* --- Flexible String Representaion Helper Macros (PEP 393) -------------- */ 364 365/* Values for PyUnicodeObject.state: */ 366 367/* Interning state. */ 368#define SSTATE_NOT_INTERNED 0 369#define SSTATE_INTERNED_MORTAL 1 370#define SSTATE_INTERNED_IMMORTAL 2 371 372/* Return true if the string contains only ASCII characters, or 0 if not. The 373 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks 374 or Ready calls are performed. */ 375#define PyUnicode_IS_ASCII(op) \ 376 (((PyASCIIObject*)op)->state.ascii) 377 378/* Return true if the string is compact or 0 if not. 379 No type checks or Ready calls are performed. 
*/ 380#define PyUnicode_IS_COMPACT(op) \ 381 (((PyASCIIObject*)(op))->state.compact) 382 383/* Return true if the string is a compact ASCII string (use PyASCIIObject 384 structure), or 0 if not. No type checks or Ready calls are performed. */ 385#define PyUnicode_IS_COMPACT_ASCII(op) \ 386 (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op)) 387 388/* String contains only wstr byte characters. This is only possible 389 when the string was created with a legacy API and _PyUnicode_Ready() 390 has not been called yet. */ 391#define PyUnicode_WCHAR_KIND 0 392 393/* Return values of the PyUnicode_KIND() macro: */ 394 395#define PyUnicode_1BYTE_KIND 1 396#define PyUnicode_2BYTE_KIND 2 397#define PyUnicode_4BYTE_KIND 3 398 399 400/* Return the number of bytes the string uses to represent single characters, 401 this can be 1, 2 or 4. 402 403 See also PyUnicode_KIND_SIZE(). */ 404#define PyUnicode_CHARACTER_SIZE(op) \ 405 (1 << (PyUnicode_KIND(op) - 1)) 406 407/* Return pointers to the canonical representation casted as unsigned char, 408 Py_UCS2, or Py_UCS4 for direct character access. 409 No checks are performed, use PyUnicode_CHARACTER_SIZE or 410 PyUnicode_KIND() before to ensure these will work correctly. */ 411 412#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op)) 413#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) 414#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) 415 416/* Return one of the PyUnicode_*_KIND values defined above. */ 417#define PyUnicode_KIND(op) \ 418 (assert(PyUnicode_Check(op)), \ 419 assert(PyUnicode_IS_READY(op)), \ 420 ((PyASCIIObject *)(op))->state.kind) 421 422/* Return a void pointer to the raw unicode buffer. */ 423#define _PyUnicode_COMPACT_DATA(op) \ 424 (PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 425 ((void*)((PyASCIIObject*)(op) + 1)) : \ 426 ((void*)((PyCompactUnicodeObject*)(op) + 1))) 427 428#define _PyUnicode_NONCOMPACT_DATA(op) \ 429 (assert(((PyUnicodeObject*)(op))->data.any), \ 430 ((((PyUnicodeObject *)(op))->data.any))) 431 432#define PyUnicode_DATA(op) \ 433 (assert(PyUnicode_Check(op)), \ 434 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \ 435 _PyUnicode_NONCOMPACT_DATA(op)) 436 437/* Compute (index * char_size) where char_size is 2 ** (kind - 1). 438 The index is a character index, the result is a size in bytes. 439 440 See also PyUnicode_CHARACTER_SIZE(). */ 441#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1)) 442 443/* In the access macros below, "kind" may be evaluated more than once. 444 All other macro parameters are evaluated exactly once, so it is safe 445 to put side effects into them (such as increasing the index). */ 446 447/* Write into the canonical representation, this macro does not do any sanity 448 checks and is intended for usage in loops. The caller should cache the 449 kind and data pointers optained form other macro calls. 450 index is the index in the string (starts at 0) and value is the new 451 code point value which shoule be written to that location. */ 452#define PyUnicode_WRITE(kind, data, index, value) \ 453 do { \ 454 switch ((kind)) { \ 455 case PyUnicode_1BYTE_KIND: { \ 456 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \ 457 break; \ 458 } \ 459 case PyUnicode_2BYTE_KIND: { \ 460 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \ 461 break; \ 462 } \ 463 default: { \ 464 assert((kind) == PyUnicode_4BYTE_KIND); \ 465 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \ 466 } \ 467 } \ 468 } while (0) 469 470/* Read a code point form the string's canonical representation. No checks 471 or ready calls are performed. */ 472#define PyUnicode_READ(kind, data, index) \ 473 ((Py_UCS4) \ 474 ((kind) == PyUnicode_1BYTE_KIND ? 
\ 475 ((const Py_UCS1 *)(data))[(index)] : \ 476 ((kind) == PyUnicode_2BYTE_KIND ? \ 477 ((const Py_UCS2 *)(data))[(index)] : \ 478 ((const Py_UCS4 *)(data))[(index)] \ 479 ) \ 480 )) 481 482/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it 483 calls PyUnicode_KIND() and might call it twice. For single reads, use 484 PyUnicode_READ_CHAR, for multiple consecutive reads callers should 485 cache kind and use PyUnicode_READ instead. */ 486#define PyUnicode_READ_CHAR(unicode, index) \ 487 (assert(PyUnicode_Check(unicode)), \ 488 assert(PyUnicode_IS_READY(unicode)), \ 489 (Py_UCS4) \ 490 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \ 491 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \ 492 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \ 493 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \ 494 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \ 495 ) \ 496 )) 497 498/* Returns the length of the unicode string. The caller has to make sure that 499 the string has it's canonical representation set before calling 500 this macro. Call PyUnicode_(FAST_)Ready to ensure that. */ 501#define PyUnicode_GET_LENGTH(op) \ 502 (assert(PyUnicode_Check(op)), \ 503 assert(PyUnicode_IS_READY(op)), \ 504 ((PyASCIIObject *)(op))->length) 505 506 507/* Fast check to determine whether an object is ready. Equivalent to 508 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */ 509 510#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) 511 512/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best 513 case. If the canonical representation is not yet set, it will still call 514 _PyUnicode_Ready(). 515 Returns 0 on success and -1 on errors. */ 516#define PyUnicode_READY(op) \ 517 (assert(PyUnicode_Check(op)), \ 518 (PyUnicode_IS_READY(op) ? 
\ 519 0 : _PyUnicode_Ready((PyObject *)(op)))) 520 521/* Return a maximum character value which is suitable for creating another 522 string based on op. This is always an approximation but more efficient 523 than interating over the string. */ 524#define PyUnicode_MAX_CHAR_VALUE(op) \ 525 (assert(PyUnicode_IS_READY(op)), \ 526 (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \ 527 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ 528 (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \ 529 (0x7fU) : (0xffU) \ 530 ) : \ 531 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \ 532 (0xffffU) : (0x10ffffU) \ 533 )))) 534 535#endif 536 537/* --- Constants ---------------------------------------------------------- */ 538 539/* This Unicode character will be used as replacement character during 540 decoding if the errors argument is set to "replace". Note: the 541 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 542 Unicode 3.0. */ 543 544#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 545 546/* === Public API ========================================================= */ 547 548/* --- Plain Py_UNICODE --------------------------------------------------- */ 549 550/* With PEP 393, this is the recommended way to allocate a new unicode object. 551 This function will allocate the object and its buffer in a single memory 552 block. Objects created using this function are not resizable. */ 553#ifndef Py_LIMITED_API 554PyAPI_FUNC(PyObject*) PyUnicode_New( 555 Py_ssize_t size, /* Number of code points in the new string */ 556 Py_UCS4 maxchar /* maximum code point value in the string */ 557 ); 558#endif 559 560/* Initializes the canonical string representation from a the deprecated 561 wstr/Py_UNICODE representation. This function is used to convert Unicode 562 objects which were created using the old API to the new flexible format 563 introduced with PEP 393. 564 565 Don't call this function directly, use the public PyUnicode_READY() macro 566 instead. 
*/ 567#ifndef Py_LIMITED_API 568PyAPI_FUNC(int) _PyUnicode_Ready( 569 PyObject *unicode /* Unicode object */ 570 ); 571#endif 572 573/* Get a copy of a Unicode string. */ 574PyAPI_FUNC(PyObject*) PyUnicode_Copy( 575 PyObject *unicode 576 ); 577 578/* Copy character from one unicode object into another, this function performs 579 character conversion when necessary and falls back to memcpy if possible. 580 581 Fail if to is too small (smaller than how_many or smaller than 582 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > 583 kind(to), or if to has more than 1 reference. 584 585 Return the number of written character, or return -1 and raise an exception 586 on error. 587 588 Pseudo-code: 589 590 how_many = min(how_many, len(from) - from_start) 591 to[to_start:to_start+how_many] = from[from_start:from_start+how_many] 592 return how_many 593 594 Note: The function doesn't write a terminating null character. 595 */ 596#ifndef Py_LIMITED_API 597PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( 598 PyObject *to, 599 Py_ssize_t to_start, 600 PyObject *from, 601 Py_ssize_t from_start, 602 Py_ssize_t how_many 603 ); 604#endif 605 606/* Create a Unicode Object from the Py_UNICODE buffer u of the given 607 size. 608 609 u may be NULL which causes the contents to be undefined. It is the 610 user's responsibility to fill in the needed data afterwards. Note 611 that modifying the Unicode object contents after construction is 612 only allowed if u was set to NULL. 613 614 The buffer is copied into the new object. 
*/ 615 616#ifndef Py_LIMITED_API 617PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 618 const Py_UNICODE *u, /* Unicode buffer */ 619 Py_ssize_t size /* size of buffer */ 620 ); 621#endif 622 623/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 624PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 625 const char *u, /* UTF-8 encoded string */ 626 Py_ssize_t size /* size of buffer */ 627 ); 628 629/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 630 UTF-8 encoded bytes. The size is determined with strlen(). */ 631PyAPI_FUNC(PyObject*) PyUnicode_FromString( 632 const char *u /* UTF-8 encoded string */ 633 ); 634 635#ifndef Py_LIMITED_API 636PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( 637 int kind, 638 const void *buffer, 639 Py_ssize_t size); 640#endif 641 642PyAPI_FUNC(PyObject*) PyUnicode_Substring( 643 PyObject *str, 644 Py_ssize_t start, 645 Py_ssize_t end); 646 647/* Copy the string into a UCS4 buffer including the null character is copy_null 648 is set. Return NULL and raise an exception on error. Raise a ValueError if 649 the buffer is smaller than the string. Return buffer on success. 650 651 buflen is the length of the buffer in (Py_UCS4) characters. */ 652PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 653 PyObject *unicode, 654 Py_UCS4* buffer, 655 Py_ssize_t buflen, 656 int copy_null); 657 658/* Copy the string into a UCS4 buffer. A new buffer is allocated using 659 * PyMem_Malloc; if this fails, NULL is returned with a memory error 660 exception set. */ 661PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 662 663/* Return a read-only pointer to the Unicode object's internal 664 Py_UNICODE buffer. 665 If the wchar_t/Py_UNICODE representation is not yet available, this 666 function will calculate it. 
*/ 667 668#ifndef Py_LIMITED_API 669PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 670 PyObject *unicode /* Unicode object */ 671 ); 672#endif 673 674/* Return a read-only pointer to the Unicode object's internal 675 Py_UNICODE buffer and save the length at size. 676 If the wchar_t/Py_UNICODE representation is not yet available, this 677 function will calculate it. */ 678 679#ifndef Py_LIMITED_API 680PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( 681 PyObject *unicode, /* Unicode object */ 682 Py_ssize_t *size /* location where to save the length */ 683 ); 684#endif 685 686/* Get the length of the Unicode object. */ 687 688PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 689 PyObject *unicode 690); 691 692/* Get the number of Py_UNICODE units in the 693 string representation. */ 694 695PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 696 PyObject *unicode /* Unicode object */ 697 ); 698 699/* Read a character from the string. */ 700 701PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 702 PyObject *unicode, 703 Py_ssize_t index 704 ); 705 706/* Write a character to the string. The string must have been created through 707 PyUnicode_New, must not be shared, and must not have been hashed yet. 708 709 Return 0 on success, -1 on error. */ 710 711PyAPI_FUNC(int) PyUnicode_WriteChar( 712 PyObject *unicode, 713 Py_ssize_t index, 714 Py_UCS4 character 715 ); 716 717#ifndef Py_LIMITED_API 718/* Get the maximum ordinal for a Unicode character. */ 719PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 720#endif 721 722/* Resize an already allocated Unicode object to the new size length. 723 724 *unicode is modified to point to the new (resized) object and 0 725 returned on success. 726 727 This API may only be called by the function which also called the 728 Unicode constructor. The refcount on the object must be 1. Otherwise, 729 an error is returned. 730 731 Error handling is implemented as follows: an exception is set, -1 732 is returned and *unicode left untouched. 
733 734*/ 735 736PyAPI_FUNC(int) PyUnicode_Resize( 737 PyObject **unicode, /* Pointer to the Unicode object */ 738 Py_ssize_t length /* New length */ 739 ); 740 741/* Coerce obj to an Unicode object and return a reference with 742 *incremented* refcount. 743 744 Coercion is done in the following way: 745 746 1. bytes, bytearray and other char buffer compatible objects are decoded 747 under the assumptions that they contain data using the UTF-8 748 encoding. Decoding is done in "strict" mode. 749 750 2. All other objects (including Unicode objects) raise an 751 exception. 752 753 The API returns NULL in case of an error. The caller is responsible 754 for decref'ing the returned objects. 755 756*/ 757 758PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 759 register PyObject *obj, /* Object */ 760 const char *encoding, /* encoding */ 761 const char *errors /* error handling */ 762 ); 763 764/* Coerce obj to an Unicode object and return a reference with 765 *incremented* refcount. 766 767 Unicode objects are passed back as-is (subclasses are converted to 768 true Unicode objects), all other objects are delegated to 769 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 770 using UTF-8 encoding as basis for decoding the object. 771 772 The API returns NULL in case of an error. The caller is responsible 773 for decref'ing the returned objects. 774 775*/ 776 777PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 778 register PyObject *obj /* Object */ 779 ); 780 781PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 782 const char *format, /* ASCII-encoded string */ 783 va_list vargs 784 ); 785PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 786 const char *format, /* ASCII-encoded string */ 787 ... 788 ); 789 790#ifndef Py_LIMITED_API 791/* Format the object based on the format_spec, as defined in PEP 3101 792 (Advanced String Formatting). 
*/ 793PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj, 794 PyObject *format_spec, 795 Py_ssize_t start, 796 Py_ssize_t end); 797#endif 798 799PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 800PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 801PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 802 const char *u /* UTF-8 encoded string */ 803 ); 804#ifndef Py_LIMITED_API 805PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 806#endif 807 808/* Use only if you know it's a string */ 809#define PyUnicode_CHECK_INTERNED(op) \ 810 (((PyASCIIObject *)(op))->state.interned) 811 812/* --- wchar_t support for platforms which support it --------------------- */ 813 814#ifdef HAVE_WCHAR_H 815 816/* Create a Unicode Object from the wchar_t buffer w of the given 817 size. 818 819 The buffer is copied into the new object. */ 820 821PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 822 register const wchar_t *w, /* wchar_t buffer */ 823 Py_ssize_t size /* size of buffer */ 824 ); 825 826/* Copies the Unicode Object contents into the wchar_t buffer w. At 827 most size wchar_t characters are copied. 828 829 Note that the resulting wchar_t string may or may not be 830 0-terminated. It is the responsibility of the caller to make sure 831 that the wchar_t string is 0-terminated in case this is required by 832 the application. 833 834 Returns the number of wchar_t characters copied (excluding a 835 possibly trailing 0-termination character) or -1 in case of an 836 error. */ 837 838PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 839 PyObject *unicode, /* Unicode object */ 840 register wchar_t *w, /* wchar_t buffer */ 841 Py_ssize_t size /* size of buffer */ 842 ); 843 844/* Convert the Unicode object to a wide character string. The output string 845 always ends with a nul character. If size is not NULL, write the number of 846 wide characters (excluding the null character) into *size. 
847 848 Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it) 849 on success. On error, returns NULL, *size is undefined and raises a 850 MemoryError. */ 851 852PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 853 PyObject *unicode, /* Unicode object */ 854 Py_ssize_t *size /* number of characters of the result */ 855 ); 856 857#ifndef Py_LIMITED_API 858PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); 859#endif 860 861#endif 862 863/* --- Unicode ordinals --------------------------------------------------- */ 864 865/* Create a Unicode Object from the given Unicode code point ordinal. 866 867 The ordinal must be in range(0x10000) on narrow Python builds 868 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 869 raised in case it is not. 870 871*/ 872 873PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 874 875/* --- Free-list management ----------------------------------------------- */ 876 877/* Clear the free list used by the Unicode implementation. 878 879 This can be used to release memory used for objects on the free 880 list back to the Python memory allocator. 881 882*/ 883 884PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 885 886/* === Builtin Codecs ===================================================== 887 888 Many of these APIs take two arguments encoding and errors. These 889 parameters encoding and errors have the same semantics as the ones 890 of the builtin str() API. 891 892 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 893 894 Error handling is set by errors which may also be set to NULL 895 meaning to use the default handling defined for the codec. Default 896 error handling for all builtin codecs is "strict" (ValueErrors are 897 raised). 898 899 The codecs all use a similar interface. Only deviation from the 900 generic ones are documented. 
901 902*/ 903 904/* --- Manage the default encoding ---------------------------------------- */ 905 906/* Returns a pointer to the default encoding (UTF-8) of the 907 Unicode object unicode and the size of the encoded representation 908 in bytes stored in *size. 909 910 In case of an error, no *size is set. 911 912 This funcation caches the UTF-8 encoded string in the unicodeobject 913 and subsequent calls will return the same string. The memory is relased 914 when the unicodeobject is deallocated. 915 916 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to 917 support the previous internal function with the same behaviour. 918 919 *** This API is for interpreter INTERNAL USE ONLY and will likely 920 *** be removed or changed in the future. 921 922 *** If you need to access the Unicode object as UTF-8 bytes string, 923 *** please use PyUnicode_AsUTF8String() instead. 924*/ 925 926#ifndef Py_LIMITED_API 927PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize( 928 PyObject *unicode, 929 Py_ssize_t *size); 930#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize 931#endif 932 933/* Returns a pointer to the default encoding (UTF-8) of the 934 Unicode object unicode. 935 936 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation 937 in the unicodeobject. 938 939 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to 940 support the previous internal function with the same behaviour. 941 942 Use of this API is DEPRECATED since no size information can be 943 extracted from the returned data. 944 945 *** This API is for interpreter INTERNAL USE ONLY and will likely 946 *** be removed or changed for Python 3.1. 947 948 *** If you need to access the Unicode object as UTF-8 bytes string, 949 *** please use PyUnicode_AsUTF8String() instead. 950 951*/ 952 953#ifndef Py_LIMITED_API 954PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode); 955#define _PyUnicode_AsString PyUnicode_AsUTF8 956#endif 957 958/* Returns "utf-8". 
*/ 959 960PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 961 962/* --- Generic Codecs ----------------------------------------------------- */ 963 964/* Create a Unicode object by decoding the encoded string s of the 965 given size. */ 966 967PyAPI_FUNC(PyObject*) PyUnicode_Decode( 968 const char *s, /* encoded string */ 969 Py_ssize_t size, /* size of buffer */ 970 const char *encoding, /* encoding */ 971 const char *errors /* error handling */ 972 ); 973 974/* Decode a Unicode object unicode and return the result as Python 975 object. */ 976 977PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 978 PyObject *unicode, /* Unicode object */ 979 const char *encoding, /* encoding */ 980 const char *errors /* error handling */ 981 ); 982 983/* Decode a Unicode object unicode and return the result as Unicode 984 object. */ 985 986PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 987 PyObject *unicode, /* Unicode object */ 988 const char *encoding, /* encoding */ 989 const char *errors /* error handling */ 990 ); 991 992/* Encodes a Py_UNICODE buffer of the given size and returns a 993 Python string object. */ 994 995#ifndef Py_LIMITED_API 996PyAPI_FUNC(PyObject*) PyUnicode_Encode( 997 const Py_UNICODE *s, /* Unicode char buffer */ 998 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 999 const char *encoding, /* encoding */ 1000 const char *errors /* error handling */ 1001 ); 1002#endif 1003 1004/* Encodes a Unicode object and returns the result as Python 1005 object. */ 1006 1007PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 1008 PyObject *unicode, /* Unicode object */ 1009 const char *encoding, /* encoding */ 1010 const char *errors /* error handling */ 1011 ); 1012 1013/* Encodes a Unicode object and returns the result as Python string 1014 object. 
*/ 1015 1016PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 1017 PyObject *unicode, /* Unicode object */ 1018 const char *encoding, /* encoding */ 1019 const char *errors /* error handling */ 1020 ); 1021 1022/* Encodes a Unicode object and returns the result as Unicode 1023 object. */ 1024 1025PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 1026 PyObject *unicode, /* Unicode object */ 1027 const char *encoding, /* encoding */ 1028 const char *errors /* error handling */ 1029 ); 1030 1031/* Build an encoding map. */ 1032 1033PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 1034 PyObject* string /* 256 character map */ 1035 ); 1036 1037/* --- UTF-7 Codecs ------------------------------------------------------- */ 1038 1039PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 1040 const char *string, /* UTF-7 encoded string */ 1041 Py_ssize_t length, /* size of string */ 1042 const char *errors /* error handling */ 1043 ); 1044 1045PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 1046 const char *string, /* UTF-7 encoded string */ 1047 Py_ssize_t length, /* size of string */ 1048 const char *errors, /* error handling */ 1049 Py_ssize_t *consumed /* bytes consumed */ 1050 ); 1051 1052#ifndef Py_LIMITED_API 1053PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 1054 const Py_UNICODE *data, /* Unicode char buffer */ 1055 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1056 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1057 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1058 const char *errors /* error handling */ 1059 ); 1060#endif 1061 1062/* --- UTF-8 Codecs ------------------------------------------------------- */ 1063 1064PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 1065 const char *string, /* UTF-8 encoded string */ 1066 Py_ssize_t length, /* size of string */ 1067 const char *errors /* error handling */ 1068 ); 1069 1070PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 1071 const char *string, /* UTF-8 encoded 
string */ 1072 Py_ssize_t length, /* size of string */ 1073 const char *errors, /* error handling */ 1074 Py_ssize_t *consumed /* bytes consumed */ 1075 ); 1076 1077PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 1078 PyObject *unicode /* Unicode object */ 1079 ); 1080 1081#ifndef Py_LIMITED_API 1082PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( 1083 PyObject *unicode, 1084 const char *errors); 1085 1086PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 1087 const Py_UNICODE *data, /* Unicode char buffer */ 1088 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1089 const char *errors /* error handling */ 1090 ); 1091#endif 1092 1093/* --- UTF-32 Codecs ------------------------------------------------------ */ 1094 1095/* Decodes length bytes from a UTF-32 encoded buffer string and returns 1096 the corresponding Unicode object. 1097 1098 errors (if non-NULL) defines the error handling. It defaults 1099 to "strict". 1100 1101 If byteorder is non-NULL, the decoder starts decoding using the 1102 given byte order: 1103 1104 *byteorder == -1: little endian 1105 *byteorder == 0: native order 1106 *byteorder == 1: big endian 1107 1108 In native mode, the first four bytes of the stream are checked for a 1109 BOM mark. If found, the BOM mark is analysed, the byte order 1110 adjusted and the BOM skipped. In the other modes, no BOM mark 1111 interpretation is done. After completion, *byteorder is set to the 1112 current byte order at the end of input data. 1113 1114 If byteorder is NULL, the codec starts in native order mode. 
1115 1116*/ 1117 1118PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 1119 const char *string, /* UTF-32 encoded string */ 1120 Py_ssize_t length, /* size of string */ 1121 const char *errors, /* error handling */ 1122 int *byteorder /* pointer to byteorder to use 1123 0=native;-1=LE,1=BE; updated on 1124 exit */ 1125 ); 1126 1127PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 1128 const char *string, /* UTF-32 encoded string */ 1129 Py_ssize_t length, /* size of string */ 1130 const char *errors, /* error handling */ 1131 int *byteorder, /* pointer to byteorder to use 1132 0=native;-1=LE,1=BE; updated on 1133 exit */ 1134 Py_ssize_t *consumed /* bytes consumed */ 1135 ); 1136 1137/* Returns a Python string using the UTF-32 encoding in native byte 1138 order. The string always starts with a BOM mark. */ 1139 1140PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 1141 PyObject *unicode /* Unicode object */ 1142 ); 1143 1144/* Returns a Python string object holding the UTF-32 encoded value of 1145 the Unicode data. 1146 1147 If byteorder is not 0, output is written according to the following 1148 byte order: 1149 1150 byteorder == -1: little endian 1151 byteorder == 0: native byte order (writes a BOM mark) 1152 byteorder == 1: big endian 1153 1154 If byteorder is 0, the output string will always start with the 1155 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1156 prepended. 1157 1158*/ 1159 1160#ifndef Py_LIMITED_API 1161PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 1162 const Py_UNICODE *data, /* Unicode char buffer */ 1163 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1164 const char *errors, /* error handling */ 1165 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1166 ); 1167#endif 1168 1169/* --- UTF-16 Codecs ------------------------------------------------------ */ 1170 1171/* Decodes length bytes from a UTF-16 encoded buffer string and returns 1172 the corresponding Unicode object. 
1173 1174 errors (if non-NULL) defines the error handling. It defaults 1175 to "strict". 1176 1177 If byteorder is non-NULL, the decoder starts decoding using the 1178 given byte order: 1179 1180 *byteorder == -1: little endian 1181 *byteorder == 0: native order 1182 *byteorder == 1: big endian 1183 1184 In native mode, the first two bytes of the stream are checked for a 1185 BOM mark. If found, the BOM mark is analysed, the byte order 1186 adjusted and the BOM skipped. In the other modes, no BOM mark 1187 interpretation is done. After completion, *byteorder is set to the 1188 current byte order at the end of input data. 1189 1190 If byteorder is NULL, the codec starts in native order mode. 1191 1192*/ 1193 1194PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 1195 const char *string, /* UTF-16 encoded string */ 1196 Py_ssize_t length, /* size of string */ 1197 const char *errors, /* error handling */ 1198 int *byteorder /* pointer to byteorder to use 1199 0=native;-1=LE,1=BE; updated on 1200 exit */ 1201 ); 1202 1203PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 1204 const char *string, /* UTF-16 encoded string */ 1205 Py_ssize_t length, /* size of string */ 1206 const char *errors, /* error handling */ 1207 int *byteorder, /* pointer to byteorder to use 1208 0=native;-1=LE,1=BE; updated on 1209 exit */ 1210 Py_ssize_t *consumed /* bytes consumed */ 1211 ); 1212 1213/* Returns a Python string using the UTF-16 encoding in native byte 1214 order. The string always starts with a BOM mark. */ 1215 1216PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 1217 PyObject *unicode /* Unicode object */ 1218 ); 1219 1220/* Returns a Python string object holding the UTF-16 encoded value of 1221 the Unicode data. 
1222 1223 If byteorder is not 0, output is written according to the following 1224 byte order: 1225 1226 byteorder == -1: little endian 1227 byteorder == 0: native byte order (writes a BOM mark) 1228 byteorder == 1: big endian 1229 1230 If byteorder is 0, the output string will always start with the 1231 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1232 prepended. 1233 1234 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 1235 UCS-2. This trick makes it possible to add full UTF-16 capabilities 1236 at a later point without compromising the APIs. 1237 1238*/ 1239 1240#ifndef Py_LIMITED_API 1241PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 1242 const Py_UNICODE *data, /* Unicode char buffer */ 1243 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1244 const char *errors, /* error handling */ 1245 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1246 ); 1247#endif 1248 1249/* --- Unicode-Escape Codecs ---------------------------------------------- */ 1250 1251PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 1252 const char *string, /* Unicode-Escape encoded string */ 1253 Py_ssize_t length, /* size of string */ 1254 const char *errors /* error handling */ 1255 ); 1256 1257PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 1258 PyObject *unicode /* Unicode object */ 1259 ); 1260 1261#ifndef Py_LIMITED_API 1262PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 1263 const Py_UNICODE *data, /* Unicode char buffer */ 1264 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1265 ); 1266#endif 1267 1268/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 1269 1270PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 1271 const char *string, /* Raw-Unicode-Escape encoded string */ 1272 Py_ssize_t length, /* size of string */ 1273 const char *errors /* error handling */ 1274 ); 1275 1276PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 1277 PyObject *unicode 
/* Unicode object */ 1278 ); 1279 1280#ifndef Py_LIMITED_API 1281PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 1282 const Py_UNICODE *data, /* Unicode char buffer */ 1283 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1284 ); 1285#endif 1286 1287/* --- Unicode Internal Codec --------------------------------------------- 1288 1289 Only for internal use in _codecsmodule.c */ 1290 1291#ifndef Py_LIMITED_API 1292PyObject *_PyUnicode_DecodeUnicodeInternal( 1293 const char *string, 1294 Py_ssize_t length, 1295 const char *errors 1296 ); 1297#endif 1298 1299/* --- Latin-1 Codecs ----------------------------------------------------- 1300 1301 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1302 1303*/ 1304 1305PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1306 const char *string, /* Latin-1 encoded string */ 1307 Py_ssize_t length, /* size of string */ 1308 const char *errors /* error handling */ 1309 ); 1310 1311PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1312 PyObject *unicode /* Unicode object */ 1313 ); 1314 1315#ifndef Py_LIMITED_API 1316PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( 1317 PyObject* unicode, 1318 const char* errors); 1319 1320PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1321 const Py_UNICODE *data, /* Unicode char buffer */ 1322 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1323 const char *errors /* error handling */ 1324 ); 1325#endif 1326 1327/* --- ASCII Codecs ------------------------------------------------------- 1328 1329 Only 7-bit ASCII data is excepted. All other codes generate errors. 
1330 1331*/ 1332 1333PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1334 const char *string, /* ASCII encoded string */ 1335 Py_ssize_t length, /* size of string */ 1336 const char *errors /* error handling */ 1337 ); 1338 1339PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1340 PyObject *unicode /* Unicode object */ 1341 ); 1342 1343#ifndef Py_LIMITED_API 1344PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( 1345 PyObject* unicode, 1346 const char* errors); 1347 1348PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1349 const Py_UNICODE *data, /* Unicode char buffer */ 1350 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1351 const char *errors /* error handling */ 1352 ); 1353#endif 1354 1355/* --- Character Map Codecs ----------------------------------------------- 1356 1357 This codec uses mappings to encode and decode characters. 1358 1359 Decoding mappings must map single string characters to single 1360 Unicode characters, integers (which are then interpreted as Unicode 1361 ordinals) or None (meaning "undefined mapping" and causing an 1362 error). 1363 1364 Encoding mappings must map single Unicode characters to single 1365 string characters, integers (which are then interpreted as Latin-1 1366 ordinals) or None (meaning "undefined mapping" and causing an 1367 error). 1368 1369 If a character lookup fails with a LookupError, the character is 1370 copied as-is meaning that its ordinal value will be interpreted as 1371 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1372 to contain those mappings which map characters to different code 1373 points. 
1374 1375*/ 1376 1377PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1378 const char *string, /* Encoded string */ 1379 Py_ssize_t length, /* size of string */ 1380 PyObject *mapping, /* character mapping 1381 (char ordinal -> unicode ordinal) */ 1382 const char *errors /* error handling */ 1383 ); 1384 1385PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1386 PyObject *unicode, /* Unicode object */ 1387 PyObject *mapping /* character mapping 1388 (unicode ordinal -> char ordinal) */ 1389 ); 1390 1391#ifndef Py_LIMITED_API 1392PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1393 const Py_UNICODE *data, /* Unicode char buffer */ 1394 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1395 PyObject *mapping, /* character mapping 1396 (unicode ordinal -> char ordinal) */ 1397 const char *errors /* error handling */ 1398 ); 1399#endif 1400 1401/* Translate a Py_UNICODE buffer of the given length by applying a 1402 character mapping table to it and return the resulting Unicode 1403 object. 1404 1405 The mapping table must map Unicode ordinal integers to Unicode 1406 ordinal integers or None (causing deletion of the character). 1407 1408 Mapping tables may be dictionaries or sequences. Unmapped character 1409 ordinals (ones which cause a LookupError) are left untouched and 1410 are copied as-is. 
1411 1412*/ 1413 1414#ifndef Py_LIMITED_API 1415PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1416 const Py_UNICODE *data, /* Unicode char buffer */ 1417 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1418 PyObject *table, /* Translate table */ 1419 const char *errors /* error handling */ 1420 ); 1421#endif 1422 1423#ifdef HAVE_MBCS 1424 1425/* --- MBCS codecs for Windows -------------------------------------------- */ 1426 1427PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1428 const char *string, /* MBCS encoded string */ 1429 Py_ssize_t length, /* size of string */ 1430 const char *errors /* error handling */ 1431 ); 1432 1433PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1434 const char *string, /* MBCS encoded string */ 1435 Py_ssize_t length, /* size of string */ 1436 const char *errors, /* error handling */ 1437 Py_ssize_t *consumed /* bytes consumed */ 1438 ); 1439 1440PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1441 PyObject *unicode /* Unicode object */ 1442 ); 1443 1444#ifndef Py_LIMITED_API 1445PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1446 const Py_UNICODE *data, /* Unicode char buffer */ 1447 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1448 const char *errors /* error handling */ 1449 ); 1450#endif 1451 1452#endif /* HAVE_MBCS */ 1453 1454/* --- Decimal Encoder ---------------------------------------------------- */ 1455 1456/* Takes a Unicode string holding a decimal value and writes it into 1457 an output buffer using standard ASCII digit codes. 1458 1459 The output buffer has to provide at least length+1 bytes of storage 1460 area. The output string is 0-terminated. 1461 1462 The encoder converts whitespace to ' ', decimal characters to their 1463 corresponding ASCII digit and all other Latin-1 characters except 1464 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 1465 are treated as errors. This includes embedded NULL bytes. 
1466 1467 Error handling is defined by the errors argument: 1468 1469 NULL or "strict": raise a ValueError 1470 "ignore": ignore the wrong characters (these are not copied to the 1471 output buffer) 1472 "replace": replaces illegal characters with '?' 1473 1474 Returns 0 on success, -1 on failure. 1475 1476*/ 1477 1478#ifndef Py_LIMITED_API 1479PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1480 Py_UNICODE *s, /* Unicode buffer */ 1481 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1482 char *output, /* Output buffer; must have size >= length */ 1483 const char *errors /* error handling */ 1484 ); 1485#endif 1486 1487/* Transforms code points that have decimal digit property to the 1488 corresponding ASCII digit code points. 1489 1490 Returns a new Unicode string on success, NULL on failure. 1491*/ 1492 1493#ifndef Py_LIMITED_API 1494PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( 1495 Py_UNICODE *s, /* Unicode buffer */ 1496 Py_ssize_t length /* Number of Py_UNICODE chars to transform */ 1497 ); 1498#endif 1499 1500/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject 1501 as argument instead of a raw buffer and length. This function additionally 1502 transforms spaces to ASCII because this is what the callers in longobject, 1503 floatobject, and complexobject did anyways. */ 1504 1505#ifndef Py_LIMITED_API 1506PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( 1507 PyObject *unicode /* Unicode object */ 1508 ); 1509#endif 1510 1511/* --- File system encoding ---------------------------------------------- */ 1512 1513/* ParseTuple converter: encode str objects to bytes using 1514 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 1515 1516PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1517 1518/* ParseTuple converter: decode bytes objects to unicode using 1519 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. 
*/ 1520 1521PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1522 1523/* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1524 and the "surrogateescape" error handler. 1525 1526 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1527 encoding. 1528 1529 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1530*/ 1531 1532PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1533 const char *s /* encoded string */ 1534 ); 1535 1536/* Decode a string using Py_FileSystemDefaultEncoding 1537 and the "surrogateescape" error handler. 1538 1539 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1540 encoding. 1541*/ 1542 1543PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1544 const char *s, /* encoded string */ 1545 Py_ssize_t size /* size */ 1546 ); 1547 1548/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1549 "surrogateescape" error handler, and return bytes. 1550 1551 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1552 encoding. 1553*/ 1554 1555PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1556 PyObject *unicode 1557 ); 1558 1559/* --- Methods & Slots ---------------------------------------------------- 1560 1561 These are capable of handling Unicode objects and strings on input 1562 (we refer to them as strings in the descriptions) and return 1563 Unicode objects or integers as apporpriate. */ 1564 1565/* Concat two strings giving a new Unicode string. 
*/ 1566 1567PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1568 PyObject *left, /* Left string */ 1569 PyObject *right /* Right string */ 1570 ); 1571 1572/* Concat two strings and put the result in *pleft 1573 (sets *pleft to NULL on error) */ 1574 1575PyAPI_FUNC(void) PyUnicode_Append( 1576 PyObject **pleft, /* Pointer to left string */ 1577 PyObject *right /* Right string */ 1578 ); 1579 1580/* Concat two strings, put the result in *pleft and drop the right object 1581 (sets *pleft to NULL on error) */ 1582 1583PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1584 PyObject **pleft, /* Pointer to left string */ 1585 PyObject *right /* Right string */ 1586 ); 1587 1588/* Split a string giving a list of Unicode strings. 1589 1590 If sep is NULL, splitting will be done at all whitespace 1591 substrings. Otherwise, splits occur at the given separator. 1592 1593 At most maxsplit splits will be done. If negative, no limit is set. 1594 1595 Separators are not included in the resulting list. 1596 1597*/ 1598 1599PyAPI_FUNC(PyObject*) PyUnicode_Split( 1600 PyObject *s, /* String to split */ 1601 PyObject *sep, /* String separator */ 1602 Py_ssize_t maxsplit /* Maxsplit count */ 1603 ); 1604 1605/* Dito, but split at line breaks. 1606 1607 CRLF is considered to be one line break. Line breaks are not 1608 included in the resulting list. */ 1609 1610PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1611 PyObject *s, /* String to split */ 1612 int keepends /* If true, line end markers are included */ 1613 ); 1614 1615/* Partition a string using a given separator. */ 1616 1617PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1618 PyObject *s, /* String to partition */ 1619 PyObject *sep /* String separator */ 1620 ); 1621 1622/* Partition a string using a given separator, searching from the end of the 1623 string. 
*/ 1624 1625PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1626 PyObject *s, /* String to partition */ 1627 PyObject *sep /* String separator */ 1628 ); 1629 1630/* Split a string giving a list of Unicode strings. 1631 1632 If sep is NULL, splitting will be done at all whitespace 1633 substrings. Otherwise, splits occur at the given separator. 1634 1635 At most maxsplit splits will be done. But unlike PyUnicode_Split 1636 PyUnicode_RSplit splits from the end of the string. If negative, 1637 no limit is set. 1638 1639 Separators are not included in the resulting list. 1640 1641*/ 1642 1643PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1644 PyObject *s, /* String to split */ 1645 PyObject *sep, /* String separator */ 1646 Py_ssize_t maxsplit /* Maxsplit count */ 1647 ); 1648 1649/* Translate a string by applying a character mapping table to it and 1650 return the resulting Unicode object. 1651 1652 The mapping table must map Unicode ordinal integers to Unicode 1653 ordinal integers or None (causing deletion of the character). 1654 1655 Mapping tables may be dictionaries or sequences. Unmapped character 1656 ordinals (ones which cause a LookupError) are left untouched and 1657 are copied as-is. 1658 1659*/ 1660 1661PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1662 PyObject *str, /* String */ 1663 PyObject *table, /* Translate table */ 1664 const char *errors /* error handling */ 1665 ); 1666 1667/* Join a sequence of strings using the given separator and return 1668 the resulting Unicode string. */ 1669 1670PyAPI_FUNC(PyObject*) PyUnicode_Join( 1671 PyObject *separator, /* Separator string */ 1672 PyObject *seq /* Sequence object */ 1673 ); 1674 1675/* Return 1 if substr matches str[start:end] at the given tail end, 0 1676 otherwise. 
*/ 1677 1678PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1679 PyObject *str, /* String */ 1680 PyObject *substr, /* Prefix or Suffix string */ 1681 Py_ssize_t start, /* Start index */ 1682 Py_ssize_t end, /* Stop index */ 1683 int direction /* Tail end: -1 prefix, +1 suffix */ 1684 ); 1685 1686/* Return the first position of substr in str[start:end] using the 1687 given search direction or -1 if not found. -2 is returned in case 1688 an error occurred and an exception is set. */ 1689 1690PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1691 PyObject *str, /* String */ 1692 PyObject *substr, /* Substring to find */ 1693 Py_ssize_t start, /* Start index */ 1694 Py_ssize_t end, /* Stop index */ 1695 int direction /* Find direction: +1 forward, -1 backward */ 1696 ); 1697 1698/* Like PyUnicode_Find, but search for single character only. */ 1699PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 1700 PyObject *str, 1701 Py_UCS4 ch, 1702 Py_ssize_t start, 1703 Py_ssize_t end, 1704 int direction 1705 ); 1706 1707/* Count the number of occurrences of substr in str[start:end]. */ 1708 1709PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1710 PyObject *str, /* String */ 1711 PyObject *substr, /* Substring to count */ 1712 Py_ssize_t start, /* Start index */ 1713 Py_ssize_t end /* Stop index */ 1714 ); 1715 1716/* Replace at most maxcount occurrences of substr in str with replstr 1717 and return the resulting Unicode object. */ 1718 1719PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1720 PyObject *str, /* String */ 1721 PyObject *substr, /* Substring to find */ 1722 PyObject *replstr, /* Substring to replace */ 1723 Py_ssize_t maxcount /* Max. number of replacements to apply; 1724 -1 = all */ 1725 ); 1726 1727/* Compare two strings and return -1, 0, 1 for less than, equal, 1728 greater than resp. 
*/ 1729 1730PyAPI_FUNC(int) PyUnicode_Compare( 1731 PyObject *left, /* Left string */ 1732 PyObject *right /* Right string */ 1733 ); 1734 1735PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1736 PyObject *left, 1737 const char *right /* ASCII-encoded string */ 1738 ); 1739 1740/* Rich compare two strings and return one of the following: 1741 1742 - NULL in case an exception was raised 1743 - Py_True or Py_False for successfuly comparisons 1744 - Py_NotImplemented in case the type combination is unknown 1745 1746 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1747 case the conversion of the arguments to Unicode fails with a 1748 UnicodeDecodeError. 1749 1750 Possible values for op: 1751 1752 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1753 1754*/ 1755 1756PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1757 PyObject *left, /* Left string */ 1758 PyObject *right, /* Right string */ 1759 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1760 ); 1761 1762/* Apply a argument tuple or dictionary to a format string and return 1763 the resulting Unicode string. */ 1764 1765PyAPI_FUNC(PyObject *) PyUnicode_Format( 1766 PyObject *format, /* Format string */ 1767 PyObject *args /* Argument tuple or dictionary */ 1768 ); 1769 1770/* Checks whether element is contained in container and return 1/0 1771 accordingly. 1772 1773 element has to coerce to an one element Unicode string. -1 is 1774 returned in case of an error. */ 1775 1776PyAPI_FUNC(int) PyUnicode_Contains( 1777 PyObject *container, /* Container string */ 1778 PyObject *element /* Element string */ 1779 ); 1780 1781/* Checks whether argument is a valid identifier. 
*/ 1782 1783PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1784 1785#ifndef Py_LIMITED_API 1786/* Externally visible for str.strip(unicode) */ 1787PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 1788 PyUnicodeObject *self, 1789 int striptype, 1790 PyObject *sepobj 1791 ); 1792#endif 1793 1794/* Using the current locale, insert the thousands grouping 1795 into the string pointed to by buffer. For the argument descriptions, 1796 see Objects/stringlib/localeutil.h */ 1797 1798#ifndef Py_LIMITED_API 1799PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer, 1800 Py_ssize_t n_buffer, 1801 Py_UNICODE *digits, 1802 Py_ssize_t n_digits, 1803 Py_ssize_t min_width); 1804#endif 1805 1806/* Using explicit passed-in values, insert the thousands grouping 1807 into the string pointed to by buffer. For the argument descriptions, 1808 see Objects/stringlib/localeutil.h */ 1809#ifndef Py_LIMITED_API 1810PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( 1811 int kind, 1812 void *buffer, 1813 Py_ssize_t n_buffer, 1814 void *digits, 1815 Py_ssize_t n_digits, 1816 Py_ssize_t min_width, 1817 const char *grouping, 1818 const char *thousands_sep); 1819#endif 1820/* === Characters Type APIs =============================================== */ 1821 1822/* Helper array used by Py_UNICODE_ISSPACE(). */ 1823 1824#ifndef Py_LIMITED_API 1825PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 1826 1827/* These should not be used directly. Use the Py_UNICODE_IS* and 1828 Py_UNICODE_TO* macros instead. 1829 1830 These APIs are implemented in Objects/unicodectype.c. 
1831 1832*/ 1833 1834PyAPI_FUNC(int) _PyUnicode_IsLowercase( 1835 Py_UCS4 ch /* Unicode character */ 1836 ); 1837 1838PyAPI_FUNC(int) _PyUnicode_IsUppercase( 1839 Py_UCS4 ch /* Unicode character */ 1840 ); 1841 1842PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 1843 Py_UCS4 ch /* Unicode character */ 1844 ); 1845 1846PyAPI_FUNC(int) _PyUnicode_IsXidStart( 1847 Py_UCS4 ch /* Unicode character */ 1848 ); 1849 1850PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 1851 Py_UCS4 ch /* Unicode character */ 1852 ); 1853 1854PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 1855 const Py_UCS4 ch /* Unicode character */ 1856 ); 1857 1858PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 1859 const Py_UCS4 ch /* Unicode character */ 1860 ); 1861 1862PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 1863 Py_UCS4 ch /* Unicode character */ 1864 ); 1865 1866PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 1867 Py_UCS4 ch /* Unicode character */ 1868 ); 1869 1870PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 1871 Py_UCS4 ch /* Unicode character */ 1872 ); 1873 1874PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 1875 Py_UCS4 ch /* Unicode character */ 1876 ); 1877 1878PyAPI_FUNC(int) _PyUnicode_ToDigit( 1879 Py_UCS4 ch /* Unicode character */ 1880 ); 1881 1882PyAPI_FUNC(double) _PyUnicode_ToNumeric( 1883 Py_UCS4 ch /* Unicode character */ 1884 ); 1885 1886PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 1887 Py_UCS4 ch /* Unicode character */ 1888 ); 1889 1890PyAPI_FUNC(int) _PyUnicode_IsDigit( 1891 Py_UCS4 ch /* Unicode character */ 1892 ); 1893 1894PyAPI_FUNC(int) _PyUnicode_IsNumeric( 1895 Py_UCS4 ch /* Unicode character */ 1896 ); 1897 1898PyAPI_FUNC(int) _PyUnicode_IsPrintable( 1899 Py_UCS4 ch /* Unicode character */ 1900 ); 1901 1902PyAPI_FUNC(int) _PyUnicode_IsAlpha( 1903 Py_UCS4 ch /* Unicode character */ 1904 ); 1905 1906PyAPI_FUNC(size_t) Py_UNICODE_strlen( 1907 const Py_UNICODE *u 1908 ); 1909 1910PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 1911 Py_UNICODE *s1, 1912 const Py_UNICODE *s2); 1913 1914PyAPI_FUNC(Py_UNICODE*) 
Py_UNICODE_strcat( 1915 Py_UNICODE *s1, const Py_UNICODE *s2); 1916 1917PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 1918 Py_UNICODE *s1, 1919 const Py_UNICODE *s2, 1920 size_t n); 1921 1922PyAPI_FUNC(int) Py_UNICODE_strcmp( 1923 const Py_UNICODE *s1, 1924 const Py_UNICODE *s2 1925 ); 1926 1927PyAPI_FUNC(int) Py_UNICODE_strncmp( 1928 const Py_UNICODE *s1, 1929 const Py_UNICODE *s2, 1930 size_t n 1931 ); 1932 1933PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 1934 const Py_UNICODE *s, 1935 Py_UNICODE c 1936 ); 1937 1938PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 1939 const Py_UNICODE *s, 1940 Py_UNICODE c 1941 ); 1942 1943PyAPI_FUNC(size_t) Py_UCS4_strlen( 1944 const Py_UCS4 *u 1945 ); 1946 1947PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy( 1948 Py_UCS4 *s1, 1949 const Py_UCS4 *s2); 1950 1951PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat( 1952 Py_UCS4 *s1, const Py_UCS4 *s2); 1953 1954PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy( 1955 Py_UCS4 *s1, 1956 const Py_UCS4 *s2, 1957 size_t n); 1958 1959PyAPI_FUNC(int) Py_UCS4_strcmp( 1960 const Py_UCS4 *s1, 1961 const Py_UCS4 *s2 1962 ); 1963 1964PyAPI_FUNC(int) Py_UCS4_strncmp( 1965 const Py_UCS4 *s1, 1966 const Py_UCS4 *s2, 1967 size_t n 1968 ); 1969 1970PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr( 1971 const Py_UCS4 *s, 1972 Py_UCS4 c 1973 ); 1974 1975PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr( 1976 const Py_UCS4 *s, 1977 Py_UCS4 c 1978 ); 1979 1980/* Create a copy of a unicode string ending with a nul character. Return NULL 1981 and raise a MemoryError exception on memory allocation failure, otherwise 1982 return a new allocated buffer (use PyMem_Free() to free the buffer). */ 1983 1984PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 1985 PyObject *unicode 1986 ); 1987#endif /* Py_LIMITED_API */ 1988 1989#ifdef __cplusplus 1990} 1991#endif 1992#endif /* !Py_UNICODEOBJECT_H */ 1993