unicodeobject.h revision feb7307db4b4582af9ac01719f7df651c2eed077
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal (see file Misc/unicode.txt). 11 12Copyright (c) Corporation for National Research Initiatives. 13 14 15 Original header: 16 -------------------------------------------------------------------- 17 18 * Yet another Unicode string type for Python. This type supports the 19 * 16-bit Basic Multilingual Plane (BMP) only. 20 * 21 * Written by Fredrik Lundh, January 1999. 22 * 23 * Copyright (c) 1999 by Secret Labs AB. 24 * Copyright (c) 1999 by Fredrik Lundh. 25 * 26 * fredrik@pythonware.com 27 * http://www.pythonware.com 28 * 29 * -------------------------------------------------------------------- 30 * This Unicode String Type is 31 * 32 * Copyright (c) 1999 by Secret Labs AB 33 * Copyright (c) 1999 by Fredrik Lundh 34 * 35 * By obtaining, using, and/or copying this software and/or its 36 * associated documentation, you agree that you have read, understood, 37 * and will comply with the following terms and conditions: 38 * 39 * Permission to use, copy, modify, and distribute this software and its 40 * associated documentation for any purpose and without fee is hereby 41 * granted, provided that the above copyright notice appears in all 42 * copies, and that both that copyright notice and this permission notice 43 * appear in supporting documentation, and that the name of Secret Labs 44 * AB or the author not be used in advertising or publicity pertaining to 45 * distribution of the software without specific, written prior 46 * permission. 47 * 48 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 50 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 55 * -------------------------------------------------------------------- */ 56 57#include <ctype.h> 58 59/* === Internal API ======================================================= */ 60 61/* --- Internal Unicode Format -------------------------------------------- */ 62 63/* Python 3.x requires unicode */ 64#define Py_USING_UNICODE 65 66/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is 67 properly set, but the default rules below doesn't set it. I'll 68 sort this out some other day -- fredrik@pythonware.com */ 69 70#ifndef Py_UNICODE_SIZE 71#error Must define Py_UNICODE_SIZE 72#endif 73 74/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode 75 strings are stored as UCS-2 (with limited support for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Defaults for various platforms */ 87#ifndef PY_UNICODE_TYPE 88 89/* Windows has a usable wchar_t type (unless we're using UCS-4) */ 90# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 91# define HAVE_USABLE_WCHAR_T 92# define PY_UNICODE_TYPE wchar_t 93# endif 94 95# if defined(Py_UNICODE_WIDE) 96# define PY_UNICODE_TYPE Py_UCS4 97# endif 98 99#endif 100 101/* If the compiler provides a wchar_t type we try to support it 102 through the interface functions PyUnicode_FromWideChar() and 103 PyUnicode_AsWideChar(). 
*/ 104 105#ifdef HAVE_USABLE_WCHAR_T 106# ifndef HAVE_WCHAR_H 107# define HAVE_WCHAR_H 108# endif 109#endif 110 111#ifdef HAVE_WCHAR_H 112/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 113# ifdef _HAVE_BSDI 114# include <time.h> 115# endif 116# include <wchar.h> 117#endif 118 119/* 120 * Use this typedef when you need to represent a UTF-16 surrogate pair 121 * as single unsigned integer. 122 */ 123#if SIZEOF_INT >= 4 124typedef unsigned int Py_UCS4; 125#elif SIZEOF_LONG >= 4 126typedef unsigned long Py_UCS4; 127#endif 128 129/* Py_UNICODE is the native Unicode storage format (code unit) used by 130 Python and represents a single Unicode element in the Unicode 131 type. */ 132 133typedef PY_UNICODE_TYPE Py_UNICODE; 134 135/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ 136 137/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds 138 produce different external names and thus cause import errors in 139 case Python interpreters and extensions with mixed compiled in 140 Unicode width assumptions are combined. 
*/ 141 142#ifndef Py_UNICODE_WIDE 143 144# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString 145# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString 146# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject 147# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode 148# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject 149# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString 150# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode 151# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String 152# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString 153# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String 154# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String 155# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String 156# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode 157# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString 158# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar 159# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist 160# define PyUnicode_Compare PyUnicodeUCS2_Compare 161# define PyUnicode_CompareWithASCII PyUnicodeUCS2_CompareASCII 162# define PyUnicode_Concat PyUnicodeUCS2_Concat 163# define PyUnicode_Append PyUnicodeUCS2_Append 164# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel 165# define PyUnicode_Contains PyUnicodeUCS2_Contains 166# define PyUnicode_Count PyUnicodeUCS2_Count 167# define PyUnicode_Decode PyUnicodeUCS2_Decode 168# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII 169# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap 170# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 171# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault 172# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize 173# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape 174# define PyUnicode_DecodeUTF32 
PyUnicodeUCS2_DecodeUTF32 175# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful 176# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 177# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful 178# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 179# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful 180# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape 181# define PyUnicode_Encode PyUnicodeUCS2_Encode 182# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII 183# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap 184# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal 185# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 186# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape 187# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 188# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 189# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 190# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape 191# define PyUnicode_Find PyUnicodeUCS2_Find 192# define PyUnicode_Format PyUnicodeUCS2_Format 193# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject 194# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat 195# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV 196# define PyUnicode_FromObject PyUnicodeUCS2_FromObject 197# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal 198# define PyUnicode_FromString PyUnicodeUCS2_FromString 199# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize 200# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode 201# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar 202# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter 203# define PyUnicode_FSDecoder PyUnicodeUCS2_FSDecoder 204# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding 205# define PyUnicode_GetMax PyUnicodeUCS2_GetMax 206# define 
PyUnicode_GetSize PyUnicodeUCS2_GetSize 207# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier 208# define PyUnicode_Join PyUnicodeUCS2_Join 209# define PyUnicode_Partition PyUnicodeUCS2_Partition 210# define PyUnicode_RPartition PyUnicodeUCS2_RPartition 211# define PyUnicode_RSplit PyUnicodeUCS2_RSplit 212# define PyUnicode_Replace PyUnicodeUCS2_Replace 213# define PyUnicode_Resize PyUnicodeUCS2_Resize 214# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare 215# define PyUnicode_Split PyUnicodeUCS2_Split 216# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines 217# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch 218# define PyUnicode_Translate PyUnicodeUCS2_Translate 219# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap 220# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString 221# define _PyUnicode_Fini _PyUnicodeUCS2_Fini 222# define _PyUnicode_Init _PyUnicodeUCS2_Init 223# define PyUnicode_strdup PyUnicodeUCS2_strdup 224 225#else 226 227# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString 228# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString 229# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject 230# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode 231# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject 232# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString 233# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode 234# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String 235# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString 236# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String 237# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String 238# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String 239# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode 240# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString 241# define PyUnicode_AsWideChar 
PyUnicodeUCS4_AsWideChar 242# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist 243# define PyUnicode_Compare PyUnicodeUCS4_Compare 244# define PyUnicode_CompareWithASCII PyUnicodeUCS4_CompareWithASCII 245# define PyUnicode_Concat PyUnicodeUCS4_Concat 246# define PyUnicode_Append PyUnicodeUCS4_Append 247# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel 248# define PyUnicode_Contains PyUnicodeUCS4_Contains 249# define PyUnicode_Count PyUnicodeUCS4_Count 250# define PyUnicode_Decode PyUnicodeUCS4_Decode 251# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII 252# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap 253# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 254# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault 255# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize 256# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape 257# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 258# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful 259# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 260# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful 261# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 262# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful 263# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape 264# define PyUnicode_Encode PyUnicodeUCS4_Encode 265# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII 266# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap 267# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal 268# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 269# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape 270# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 271# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 272# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 273# define 
PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape 274# define PyUnicode_Find PyUnicodeUCS4_Find 275# define PyUnicode_Format PyUnicodeUCS4_Format 276# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject 277# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat 278# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV 279# define PyUnicode_FromObject PyUnicodeUCS4_FromObject 280# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal 281# define PyUnicode_FromString PyUnicodeUCS4_FromString 282# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize 283# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode 284# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar 285# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter 286# define PyUnicode_FSDecoder PyUnicodeUCS4_FSDecoder 287# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding 288# define PyUnicode_GetMax PyUnicodeUCS4_GetMax 289# define PyUnicode_GetSize PyUnicodeUCS4_GetSize 290# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier 291# define PyUnicode_Join PyUnicodeUCS4_Join 292# define PyUnicode_Partition PyUnicodeUCS4_Partition 293# define PyUnicode_RPartition PyUnicodeUCS4_RPartition 294# define PyUnicode_RSplit PyUnicodeUCS4_RSplit 295# define PyUnicode_Replace PyUnicodeUCS4_Replace 296# define PyUnicode_Resize PyUnicodeUCS4_Resize 297# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare 298# define PyUnicode_Split PyUnicodeUCS4_Split 299# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines 300# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch 301# define PyUnicode_Translate PyUnicodeUCS4_Translate 302# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap 303# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString 304# define _PyUnicode_Fini _PyUnicodeUCS4_Fini 305# define _PyUnicode_Init _PyUnicodeUCS4_Init 306# define PyUnicode_strdup PyUnicodeUCS4_strdup 307 308#endif 309 310/* --- 
Internal Unicode Operations ---------------------------------------- */ 311 312/* Since splitting on whitespace is an important use case, and 313 whitespace in most situations is solely ASCII whitespace, we 314 optimize for the common case by using a quick look-up table 315 _Py_ascii_whitespace (see below) with an inlined check. 316 317 */ 318#define Py_UNICODE_ISSPACE(ch) \ 319 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 320 321#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 322#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 323#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 324#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 325 326#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 327#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 328#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 329 330#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 331#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 332#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 333#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 334 335#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 336#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 337#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 338 339#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 340 341#define Py_UNICODE_ISALNUM(ch) \ 342 (Py_UNICODE_ISALPHA(ch) || \ 343 Py_UNICODE_ISDECIMAL(ch) || \ 344 Py_UNICODE_ISDIGIT(ch) || \ 345 Py_UNICODE_ISNUMERIC(ch)) 346 347#define Py_UNICODE_COPY(target, source, length) \ 348 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 349 350#define Py_UNICODE_FILL(target, value, length) \ 351 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 352 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 353 } while (0) 354 355/* Check if substring matches at given offset. 
the offset must be 356 valid, and the substring must not be empty */ 357 358#define Py_UNICODE_MATCH(string, offset, substring) \ 359 ((*((string)->str + (offset)) == *((substring)->str)) && \ 360 ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ 361 !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) 362 363#ifdef __cplusplus 364extern "C" { 365#endif 366 367/* --- Unicode Type ------------------------------------------------------- */ 368 369typedef struct { 370 PyObject_HEAD 371 Py_ssize_t length; /* Length of raw Unicode data in buffer */ 372 Py_UNICODE *str; /* Raw Unicode buffer */ 373 long hash; /* Hash value; -1 if not set */ 374 int state; /* != 0 if interned. In this case the two 375 * references from the dictionary to this object 376 * are *not* counted in ob_refcnt. */ 377 PyObject *defenc; /* (Default) Encoded version as Python 378 string, or NULL; this is used for 379 implementing the buffer protocol */ 380} PyUnicodeObject; 381 382PyAPI_DATA(PyTypeObject) PyUnicode_Type; 383PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 384 385#define SSTATE_NOT_INTERNED 0 386#define SSTATE_INTERNED_MORTAL 1 387#define SSTATE_INTERNED_IMMORTAL 2 388 389#define PyUnicode_Check(op) \ 390 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 391#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 392 393/* Fast access macros */ 394#define PyUnicode_GET_SIZE(op) \ 395 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length)) 396#define PyUnicode_GET_DATA_SIZE(op) \ 397 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))) 398#define PyUnicode_AS_UNICODE(op) \ 399 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str)) 400#define PyUnicode_AS_DATA(op) \ 401 (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str)) 402 403/* --- Constants 
---------------------------------------------------------- */ 404 405/* This Unicode character will be used as replacement character during 406 decoding if the errors argument is set to "replace". Note: the 407 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 408 Unicode 3.0. */ 409 410#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 411 412/* === Public API ========================================================= */ 413 414/* --- Plain Py_UNICODE --------------------------------------------------- */ 415 416/* Create a Unicode Object from the Py_UNICODE buffer u of the given 417 size. 418 419 u may be NULL which causes the contents to be undefined. It is the 420 user's responsibility to fill in the needed data afterwards. Note 421 that modifying the Unicode object contents after construction is 422 only allowed if u was set to NULL. 423 424 The buffer is copied into the new object. */ 425 426PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 427 const Py_UNICODE *u, /* Unicode buffer */ 428 Py_ssize_t size /* size of buffer */ 429 ); 430 431/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 432PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 433 const char *u, /* char buffer */ 434 Py_ssize_t size /* size of buffer */ 435 ); 436 437/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 438 UTF-8 encoded bytes */ 439PyAPI_FUNC(PyObject*) PyUnicode_FromString( 440 const char *u /* string */ 441 ); 442 443/* Return a read-only pointer to the Unicode object's internal 444 Py_UNICODE buffer. */ 445 446PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 447 PyObject *unicode /* Unicode object */ 448 ); 449 450/* Get the length of the Unicode object. */ 451 452PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 453 PyObject *unicode /* Unicode object */ 454 ); 455 456/* Get the maximum ordinal for a Unicode character. 
*/ 457PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 458 459/* Resize an already allocated Unicode object to the new size length. 460 461 *unicode is modified to point to the new (resized) object and 0 462 returned on success. 463 464 This API may only be called by the function which also called the 465 Unicode constructor. The refcount on the object must be 1. Otherwise, 466 an error is returned. 467 468 Error handling is implemented as follows: an exception is set, -1 469 is returned and *unicode left untouched. 470 471*/ 472 473PyAPI_FUNC(int) PyUnicode_Resize( 474 PyObject **unicode, /* Pointer to the Unicode object */ 475 Py_ssize_t length /* New length */ 476 ); 477 478/* Coerce obj to an Unicode object and return a reference with 479 *incremented* refcount. 480 481 Coercion is done in the following way: 482 483 1. bytes, bytearray and other char buffer compatible objects are decoded 484 under the assumptions that they contain data using the current 485 default encoding. Decoding is done in "strict" mode. 486 487 2. All other objects (including Unicode objects) raise an 488 exception. 489 490 The API returns NULL in case of an error. The caller is responsible 491 for decref'ing the returned objects. 492 493*/ 494 495PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 496 register PyObject *obj, /* Object */ 497 const char *encoding, /* encoding */ 498 const char *errors /* error handling */ 499 ); 500 501/* Coerce obj to an Unicode object and return a reference with 502 *incremented* refcount. 503 504 Unicode objects are passed back as-is (subclasses are converted to 505 true Unicode objects), all other objects are delegated to 506 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 507 using UTF-8 encoding as basis for decoding the object. 508 509 The API returns NULL in case of an error. The caller is responsible 510 for decref'ing the returned objects. 
511 512*/ 513 514PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 515 register PyObject *obj /* Object */ 516 ); 517 518PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 519 const char *format, /* ASCII-encoded string */ 520 va_list vargs 521 ); 522PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 523 const char *format, /* ASCII-encoded string */ 524 ... 525 ); 526 527/* Format the object based on the format_spec, as defined in PEP 3101 528 (Advanced String Formatting). */ 529PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj, 530 Py_UNICODE *format_spec, 531 Py_ssize_t format_spec_len); 532 533PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 534PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 535PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *); 536PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 537 538/* Use only if you know it's a string */ 539#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state) 540 541/* --- wchar_t support for platforms which support it --------------------- */ 542 543#ifdef HAVE_WCHAR_H 544 545/* Create a Unicode Object from the wchar_t buffer w of the given 546 size. 547 548 The buffer is copied into the new object. */ 549 550PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 551 register const wchar_t *w, /* wchar_t buffer */ 552 Py_ssize_t size /* size of buffer */ 553 ); 554 555/* Copies the Unicode Object contents into the wchar_t buffer w. At 556 most size wchar_t characters are copied. 557 558 Note that the resulting wchar_t string may or may not be 559 0-terminated. It is the responsibility of the caller to make sure 560 that the wchar_t string is 0-terminated in case this is required by 561 the application. 562 563 Returns the number of wchar_t characters copied (excluding a 564 possibly trailing 0-termination character) or -1 in case of an 565 error. 
*/ 566 567PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 568 PyUnicodeObject *unicode, /* Unicode object */ 569 register wchar_t *w, /* wchar_t buffer */ 570 Py_ssize_t size /* size of buffer */ 571 ); 572 573#endif 574 575/* --- Unicode ordinals --------------------------------------------------- */ 576 577/* Create a Unicode Object from the given Unicode code point ordinal. 578 579 The ordinal must be in range(0x10000) on narrow Python builds 580 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 581 raised in case it is not. 582 583*/ 584 585PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 586 587/* --- Free-list management ----------------------------------------------- */ 588 589/* Clear the free list used by the Unicode implementation. 590 591 This can be used to release memory used for objects on the free 592 list back to the Python memory allocator. 593 594*/ 595 596PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 597 598/* === Builtin Codecs ===================================================== 599 600 Many of these APIs take two arguments encoding and errors. These 601 parameters encoding and errors have the same semantics as the ones 602 of the builtin unicode() API. 603 604 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 605 606 Error handling is set by errors which may also be set to NULL 607 meaning to use the default handling defined for the codec. Default 608 error handling for all builtin codecs is "strict" (ValueErrors are 609 raised). 610 611 The codecs all use a similar interface. Only deviation from the 612 generic ones are documented. 613 614*/ 615 616/* --- Manage the default encoding ---------------------------------------- */ 617 618/* Return a Python string holding the default encoded value of the 619 Unicode object. 620 621 The resulting string is cached in the Unicode object for subsequent 622 usage by this function. 
The cached version is needed to implement 623 the character buffer interface and will live (at least) as long as 624 the Unicode object itself. 625 626 The refcount of the string is *not* incremented. 627 628 *** Exported for internal use by the interpreter only !!! *** 629 630*/ 631 632PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString( 633 PyObject *unicode, 634 const char *errors); 635 636/* Returns a pointer to the default encoding (normally, UTF-8) of the 637 Unicode object unicode and the size of the encoded representation 638 in bytes stored in *size. 639 640 In case of an error, no *size is set. 641 642 *** This API is for interpreter INTERNAL USE ONLY and will likely 643 *** be removed or changed for Python 3.1. 644 645 *** If you need to access the Unicode object as UTF-8 bytes string, 646 *** please use PyUnicode_AsUTF8String() instead. 647 648*/ 649 650PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize( 651 PyObject *unicode, 652 Py_ssize_t *size); 653 654/* Returns a pointer to the default encoding (normally, UTf-8) of the 655 Unicode object unicode. 656 657 Use of this API is DEPRECATED since no size information can be 658 extracted from the returned data. 659 660 *** This API is for interpreter INTERNAL USE ONLY and will likely 661 *** be removed or changed for Python 3.1. 662 663 *** If you need to access the Unicode object as UTF-8 bytes string, 664 *** please use PyUnicode_AsUTF8String() instead. 665 666*/ 667 668PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode); 669 670/* Returns the currently active default encoding. 671 672 The default encoding is currently implemented as run-time settable 673 process global. This may change in future versions of the 674 interpreter to become a parameter which is managed on a per-thread 675 basis. 
676 677 */ 678 679PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 680 681/* --- Generic Codecs ----------------------------------------------------- */ 682 683/* Create a Unicode object by decoding the encoded string s of the 684 given size. */ 685 686PyAPI_FUNC(PyObject*) PyUnicode_Decode( 687 const char *s, /* encoded string */ 688 Py_ssize_t size, /* size of buffer */ 689 const char *encoding, /* encoding */ 690 const char *errors /* error handling */ 691 ); 692 693/* Decode a Unicode object unicode and return the result as Python 694 object. */ 695 696PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 697 PyObject *unicode, /* Unicode object */ 698 const char *encoding, /* encoding */ 699 const char *errors /* error handling */ 700 ); 701 702/* Decode a Unicode object unicode and return the result as Unicode 703 object. */ 704 705PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 706 PyObject *unicode, /* Unicode object */ 707 const char *encoding, /* encoding */ 708 const char *errors /* error handling */ 709 ); 710 711/* Encodes a Py_UNICODE buffer of the given size and returns a 712 Python string object. */ 713 714PyAPI_FUNC(PyObject*) PyUnicode_Encode( 715 const Py_UNICODE *s, /* Unicode char buffer */ 716 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 717 const char *encoding, /* encoding */ 718 const char *errors /* error handling */ 719 ); 720 721/* Encodes a Unicode object and returns the result as Python 722 object. */ 723 724PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 725 PyObject *unicode, /* Unicode object */ 726 const char *encoding, /* encoding */ 727 const char *errors /* error handling */ 728 ); 729 730/* Encodes a Unicode object and returns the result as Python string 731 object. 
*/ 732 733PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 734 PyObject *unicode, /* Unicode object */ 735 const char *encoding, /* encoding */ 736 const char *errors /* error handling */ 737 ); 738 739/* Encodes a Unicode object and returns the result as Unicode 740 object. */ 741 742PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 743 PyObject *unicode, /* Unicode object */ 744 const char *encoding, /* encoding */ 745 const char *errors /* error handling */ 746 ); 747 748/* Build an encoding map. */ 749 750PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 751 PyObject* string /* 256 character map */ 752 ); 753 754/* --- UTF-7 Codecs ------------------------------------------------------- */ 755 756PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 757 const char *string, /* UTF-7 encoded string */ 758 Py_ssize_t length, /* size of string */ 759 const char *errors /* error handling */ 760 ); 761 762PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 763 const char *string, /* UTF-7 encoded string */ 764 Py_ssize_t length, /* size of string */ 765 const char *errors, /* error handling */ 766 Py_ssize_t *consumed /* bytes consumed */ 767 ); 768 769PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 770 const Py_UNICODE *data, /* Unicode char buffer */ 771 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 772 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 773 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 774 const char *errors /* error handling */ 775 ); 776 777/* --- UTF-8 Codecs ------------------------------------------------------- */ 778 779PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 780 const char *string, /* UTF-8 encoded string */ 781 Py_ssize_t length, /* size of string */ 782 const char *errors /* error handling */ 783 ); 784 785PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 786 const char *string, /* UTF-8 encoded string */ 787 Py_ssize_t length, /* size of string */ 788 const char *errors, /* error handling 
*/ 789 Py_ssize_t *consumed /* bytes consumed */ 790 ); 791 792PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 793 PyObject *unicode /* Unicode object */ 794 ); 795 796PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 797 const Py_UNICODE *data, /* Unicode char buffer */ 798 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 799 const char *errors /* error handling */ 800 ); 801 802/* --- UTF-32 Codecs ------------------------------------------------------ */ 803 804/* Decodes length bytes from a UTF-32 encoded buffer string and returns 805 the corresponding Unicode object. 806 807 errors (if non-NULL) defines the error handling. It defaults 808 to "strict". 809 810 If byteorder is non-NULL, the decoder starts decoding using the 811 given byte order: 812 813 *byteorder == -1: little endian 814 *byteorder == 0: native order 815 *byteorder == 1: big endian 816 817 In native mode, the first four bytes of the stream are checked for a 818 BOM mark. If found, the BOM mark is analysed, the byte order 819 adjusted and the BOM skipped. In the other modes, no BOM mark 820 interpretation is done. After completion, *byteorder is set to the 821 current byte order at the end of input data. 822 823 If byteorder is NULL, the codec starts in native order mode. 
824 825*/ 826 827PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 828 const char *string, /* UTF-32 encoded string */ 829 Py_ssize_t length, /* size of string */ 830 const char *errors, /* error handling */ 831 int *byteorder /* pointer to byteorder to use 832 0=native;-1=LE,1=BE; updated on 833 exit */ 834 ); 835 836PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 837 const char *string, /* UTF-32 encoded string */ 838 Py_ssize_t length, /* size of string */ 839 const char *errors, /* error handling */ 840 int *byteorder, /* pointer to byteorder to use 841 0=native;-1=LE,1=BE; updated on 842 exit */ 843 Py_ssize_t *consumed /* bytes consumed */ 844 ); 845 846/* Returns a Python string using the UTF-32 encoding in native byte 847 order. The string always starts with a BOM mark. */ 848 849PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 850 PyObject *unicode /* Unicode object */ 851 ); 852 853/* Returns a Python string object holding the UTF-32 encoded value of 854 the Unicode data. 855 856 If byteorder is not 0, output is written according to the following 857 byte order: 858 859 byteorder == -1: little endian 860 byteorder == 0: native byte order (writes a BOM mark) 861 byteorder == 1: big endian 862 863 If byteorder is 0, the output string will always start with the 864 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 865 prepended. 866 867*/ 868 869PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 870 const Py_UNICODE *data, /* Unicode char buffer */ 871 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 872 const char *errors, /* error handling */ 873 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 874 ); 875 876/* --- UTF-16 Codecs ------------------------------------------------------ */ 877 878/* Decodes length bytes from a UTF-16 encoded buffer string and returns 879 the corresponding Unicode object. 880 881 errors (if non-NULL) defines the error handling. It defaults 882 to "strict". 
883 884 If byteorder is non-NULL, the decoder starts decoding using the 885 given byte order: 886 887 *byteorder == -1: little endian 888 *byteorder == 0: native order 889 *byteorder == 1: big endian 890 891 In native mode, the first two bytes of the stream are checked for a 892 BOM mark. If found, the BOM mark is analysed, the byte order 893 adjusted and the BOM skipped. In the other modes, no BOM mark 894 interpretation is done. After completion, *byteorder is set to the 895 current byte order at the end of input data. 896 897 If byteorder is NULL, the codec starts in native order mode. 898 899*/ 900 901PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 902 const char *string, /* UTF-16 encoded string */ 903 Py_ssize_t length, /* size of string */ 904 const char *errors, /* error handling */ 905 int *byteorder /* pointer to byteorder to use 906 0=native;-1=LE,1=BE; updated on 907 exit */ 908 ); 909 910PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 911 const char *string, /* UTF-16 encoded string */ 912 Py_ssize_t length, /* size of string */ 913 const char *errors, /* error handling */ 914 int *byteorder, /* pointer to byteorder to use 915 0=native;-1=LE,1=BE; updated on 916 exit */ 917 Py_ssize_t *consumed /* bytes consumed */ 918 ); 919 920/* Returns a Python string using the UTF-16 encoding in native byte 921 order. The string always starts with a BOM mark. */ 922 923PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 924 PyObject *unicode /* Unicode object */ 925 ); 926 927/* Returns a Python string object holding the UTF-16 encoded value of 928 the Unicode data. 929 930 If byteorder is not 0, output is written according to the following 931 byte order: 932 933 byteorder == -1: little endian 934 byteorder == 0: native byte order (writes a BOM mark) 935 byteorder == 1: big endian 936 937 If byteorder is 0, the output string will always start with the 938 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 939 prepended. 
940 941 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 942 UCS-2. This trick makes it possible to add full UTF-16 capabilities 943 at a later point without compromising the APIs. 944 945*/ 946 947PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 948 const Py_UNICODE *data, /* Unicode char buffer */ 949 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 950 const char *errors, /* error handling */ 951 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 952 ); 953 954/* --- Unicode-Escape Codecs ---------------------------------------------- */ 955 956PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 957 const char *string, /* Unicode-Escape encoded string */ 958 Py_ssize_t length, /* size of string */ 959 const char *errors /* error handling */ 960 ); 961 962PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 963 PyObject *unicode /* Unicode object */ 964 ); 965 966PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 967 const Py_UNICODE *data, /* Unicode char buffer */ 968 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 969 ); 970 971/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 972 973PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 974 const char *string, /* Raw-Unicode-Escape encoded string */ 975 Py_ssize_t length, /* size of string */ 976 const char *errors /* error handling */ 977 ); 978 979PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 980 PyObject *unicode /* Unicode object */ 981 ); 982 983PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 984 const Py_UNICODE *data, /* Unicode char buffer */ 985 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 986 ); 987 988/* --- Unicode Internal Codec --------------------------------------------- 989 990 Only for internal use in _codecsmodule.c */ 991 992PyObject *_PyUnicode_DecodeUnicodeInternal( 993 const char *string, 994 Py_ssize_t length, 995 const char *errors 996 ); 997 998/* --- Latin-1 
Codecs ----------------------------------------------------- 999 1000 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1001 1002*/ 1003 1004PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1005 const char *string, /* Latin-1 encoded string */ 1006 Py_ssize_t length, /* size of string */ 1007 const char *errors /* error handling */ 1008 ); 1009 1010PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1011 PyObject *unicode /* Unicode object */ 1012 ); 1013 1014PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1015 const Py_UNICODE *data, /* Unicode char buffer */ 1016 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1017 const char *errors /* error handling */ 1018 ); 1019 1020/* --- ASCII Codecs ------------------------------------------------------- 1021 1022 Only 7-bit ASCII data is accepted. All other codes generate errors. 1023 1024*/ 1025 1026PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1027 const char *string, /* ASCII encoded string */ 1028 Py_ssize_t length, /* size of string */ 1029 const char *errors /* error handling */ 1030 ); 1031 1032PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1033 PyObject *unicode /* Unicode object */ 1034 ); 1035 1036PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1037 const Py_UNICODE *data, /* Unicode char buffer */ 1038 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1039 const char *errors /* error handling */ 1040 ); 1041 1042/* --- Character Map Codecs ----------------------------------------------- 1043 1044 This codec uses mappings to encode and decode characters. 1045 1046 Decoding mappings must map single string characters to single 1047 Unicode characters, integers (which are then interpreted as Unicode 1048 ordinals) or None (meaning "undefined mapping" and causing an 1049 error). 
1050 1051 Encoding mappings must map single Unicode characters to single 1052 string characters, integers (which are then interpreted as Latin-1 1053 ordinals) or None (meaning "undefined mapping" and causing an 1054 error). 1055 1056 If a character lookup fails with a LookupError, the character is 1057 copied as-is meaning that its ordinal value will be interpreted as 1058 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1059 to contain those mappings which map characters to different code 1060 points. 1061 1062*/ 1063 1064PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1065 const char *string, /* Encoded string */ 1066 Py_ssize_t length, /* size of string */ 1067 PyObject *mapping, /* character mapping 1068 (char ordinal -> unicode ordinal) */ 1069 const char *errors /* error handling */ 1070 ); 1071 1072PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1073 PyObject *unicode, /* Unicode object */ 1074 PyObject *mapping /* character mapping 1075 (unicode ordinal -> char ordinal) */ 1076 ); 1077 1078PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1079 const Py_UNICODE *data, /* Unicode char buffer */ 1080 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1081 PyObject *mapping, /* character mapping 1082 (unicode ordinal -> char ordinal) */ 1083 const char *errors /* error handling */ 1084 ); 1085 1086/* Translate a Py_UNICODE buffer of the given length by applying a 1087 character mapping table to it and return the resulting Unicode 1088 object. 1089 1090 The mapping table must map Unicode ordinal integers to Unicode 1091 ordinal integers or None (causing deletion of the character). 1092 1093 Mapping tables may be dictionaries or sequences. Unmapped character 1094 ordinals (ones which cause a LookupError) are left untouched and 1095 are copied as-is. 
1096 1097*/ 1098 1099PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1100 const Py_UNICODE *data, /* Unicode char buffer */ 1101 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1102 PyObject *table, /* Translate table */ 1103 const char *errors /* error handling */ 1104 ); 1105 1106#ifdef MS_WIN32 1107 1108/* --- MBCS codecs for Windows -------------------------------------------- */ 1109 1110PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1111 const char *string, /* MBCS encoded string */ 1112 Py_ssize_t length, /* size of string */ 1113 const char *errors /* error handling */ 1114 ); 1115 1116PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1117 const char *string, /* MBCS encoded string */ 1118 Py_ssize_t length, /* size of string */ 1119 const char *errors, /* error handling */ 1120 Py_ssize_t *consumed /* bytes consumed */ 1121 ); 1122 1123PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1124 PyObject *unicode /* Unicode object */ 1125 ); 1126 1127PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1128 const Py_UNICODE *data, /* Unicode char buffer */ 1129 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1130 const char *errors /* error handling */ 1131 ); 1132 1133#endif /* MS_WIN32 */ 1134 1135/* --- Decimal Encoder ---------------------------------------------------- */ 1136 1137/* Takes a Unicode string holding a decimal value and writes it into 1138 an output buffer using standard ASCII digit codes. 1139 1140 The output buffer has to provide at least length+1 bytes of storage 1141 area. The output string is 0-terminated. 1142 1143 The encoder converts whitespace to ' ', decimal characters to their 1144 corresponding ASCII digit and all other Latin-1 characters except 1145 \0 as-is. Characters outside this range (Unicode ordinals 1-255) 1146 are treated as errors. This includes embedded NULL bytes. 
1147 1148 Error handling is defined by the errors argument: 1149 1150 NULL or "strict": raise a ValueError 1151 "ignore": ignore the wrong characters (these are not copied to the 1152 output buffer) 1153 "replace": replaces illegal characters with '?' 1154 1155 Returns 0 on success, -1 on failure. 1156 1157*/ 1158 1159PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1160 Py_UNICODE *s, /* Unicode buffer */ 1161 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1162 char *output, /* Output buffer; must have size >= length+1 */ 1163 const char *errors /* error handling */ 1164 ); 1165 1166/* --- File system encoding ---------------------------------------------- */ 1167 1168/* ParseTuple converter: encode str objects to bytes using 1169 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 1170 1171PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1172 1173/* ParseTuple converter: decode bytes objects to unicode using 1174 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 1175 1176PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1177 1178/* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1179 and the "surrogateescape" error handler. 1180 1181 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8. 1182 1183 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1184*/ 1185 1186PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1187 const char *s /* encoded string */ 1188 ); 1189 1190/* Decode a string using Py_FileSystemDefaultEncoding 1191 and the "surrogateescape" error handler. 1192 1193 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8. 1194*/ 1195 1196PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1197 const char *s, /* encoded string */ 1198 Py_ssize_t size /* size */ 1199 ); 1200 1201/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1202 "surrogateescape" error handler, and return bytes. 
1203 1204 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8. 1205*/ 1206 1207PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1208 PyObject *unicode 1209 ); 1210 1211/* --- Methods & Slots ---------------------------------------------------- 1212 1213 These are capable of handling Unicode objects and strings on input 1214 (we refer to them as strings in the descriptions) and return 1215 Unicode objects or integers as appropriate. */ 1216 1217/* Concat two strings giving a new Unicode string. */ 1218 1219PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1220 PyObject *left, /* Left string */ 1221 PyObject *right /* Right string */ 1222 ); 1223 1224/* Concat two strings and put the result in *pleft 1225 (sets *pleft to NULL on error) */ 1226 1227PyAPI_FUNC(void) PyUnicode_Append( 1228 PyObject **pleft, /* Pointer to left string */ 1229 PyObject *right /* Right string */ 1230 ); 1231 1232/* Concat two strings, put the result in *pleft and drop the right object 1233 (sets *pleft to NULL on error) */ 1234 1235PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1236 PyObject **pleft, /* Pointer to left string */ 1237 PyObject *right /* Right string */ 1238 ); 1239 1240/* Split a string giving a list of Unicode strings. 1241 1242 If sep is NULL, splitting will be done at all whitespace 1243 substrings. Otherwise, splits occur at the given separator. 1244 1245 At most maxsplit splits will be done. If negative, no limit is set. 1246 1247 Separators are not included in the resulting list. 1248 1249*/ 1250 1251PyAPI_FUNC(PyObject*) PyUnicode_Split( 1252 PyObject *s, /* String to split */ 1253 PyObject *sep, /* String separator */ 1254 Py_ssize_t maxsplit /* Maxsplit count */ 1255 ); 1256 1257/* Ditto, but split at line breaks. 1258 1259 CRLF is considered to be one line break. Line breaks are not 1260 included in the resulting list. 
*/ 1261 1262PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1263 PyObject *s, /* String to split */ 1264 int keepends /* If true, line end markers are included */ 1265 ); 1266 1267/* Partition a string using a given separator. */ 1268 1269PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1270 PyObject *s, /* String to partition */ 1271 PyObject *sep /* String separator */ 1272 ); 1273 1274/* Partition a string using a given separator, searching from the end of the 1275 string. */ 1276 1277PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1278 PyObject *s, /* String to partition */ 1279 PyObject *sep /* String separator */ 1280 ); 1281 1282/* Split a string giving a list of Unicode strings. 1283 1284 If sep is NULL, splitting will be done at all whitespace 1285 substrings. Otherwise, splits occur at the given separator. 1286 1287 At most maxsplit splits will be done. But unlike PyUnicode_Split 1288 PyUnicode_RSplit splits from the end of the string. If negative, 1289 no limit is set. 1290 1291 Separators are not included in the resulting list. 1292 1293*/ 1294 1295PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1296 PyObject *s, /* String to split */ 1297 PyObject *sep, /* String separator */ 1298 Py_ssize_t maxsplit /* Maxsplit count */ 1299 ); 1300 1301/* Translate a string by applying a character mapping table to it and 1302 return the resulting Unicode object. 1303 1304 The mapping table must map Unicode ordinal integers to Unicode 1305 ordinal integers or None (causing deletion of the character). 1306 1307 Mapping tables may be dictionaries or sequences. Unmapped character 1308 ordinals (ones which cause a LookupError) are left untouched and 1309 are copied as-is. 1310 1311*/ 1312 1313PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1314 PyObject *str, /* String */ 1315 PyObject *table, /* Translate table */ 1316 const char *errors /* error handling */ 1317 ); 1318 1319/* Join a sequence of strings using the given separator and return 1320 the resulting Unicode string. 
*/ 1321 1322PyAPI_FUNC(PyObject*) PyUnicode_Join( 1323 PyObject *separator, /* Separator string */ 1324 PyObject *seq /* Sequence object */ 1325 ); 1326 1327/* Return 1 if substr matches str[start:end] at the given tail end, 0 1328 otherwise. */ 1329 1330PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1331 PyObject *str, /* String */ 1332 PyObject *substr, /* Prefix or Suffix string */ 1333 Py_ssize_t start, /* Start index */ 1334 Py_ssize_t end, /* Stop index */ 1335 int direction /* Tail end: -1 prefix, +1 suffix */ 1336 ); 1337 1338/* Return the first position of substr in str[start:end] using the 1339 given search direction or -1 if not found. -2 is returned in case 1340 an error occurred and an exception is set. */ 1341 1342PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1343 PyObject *str, /* String */ 1344 PyObject *substr, /* Substring to find */ 1345 Py_ssize_t start, /* Start index */ 1346 Py_ssize_t end, /* Stop index */ 1347 int direction /* Find direction: +1 forward, -1 backward */ 1348 ); 1349 1350/* Count the number of occurrences of substr in str[start:end]. */ 1351 1352PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1353 PyObject *str, /* String */ 1354 PyObject *substr, /* Substring to count */ 1355 Py_ssize_t start, /* Start index */ 1356 Py_ssize_t end /* Stop index */ 1357 ); 1358 1359/* Replace at most maxcount occurrences of substr in str with replstr 1360 and return the resulting Unicode object. */ 1361 1362PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1363 PyObject *str, /* String */ 1364 PyObject *substr, /* Substring to find */ 1365 PyObject *replstr, /* Substring to replace */ 1366 Py_ssize_t maxcount /* Max. number of replacements to apply; 1367 -1 = all */ 1368 ); 1369 1370/* Compare two strings and return -1, 0, 1 for less than, equal, 1371 greater than resp. 
*/ 1372 1373PyAPI_FUNC(int) PyUnicode_Compare( 1374 PyObject *left, /* Left string */ 1375 PyObject *right /* Right string */ 1376 ); 1377 1378PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1379 PyObject *left, 1380 const char *right 1381 ); 1382 1383/* Rich compare two strings and return one of the following: 1384 1385 - NULL in case an exception was raised 1386 - Py_True or Py_False for successful comparisons 1387 - Py_NotImplemented in case the type combination is unknown 1388 1389 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1390 case the conversion of the arguments to Unicode fails with a 1391 UnicodeDecodeError. 1392 1393 Possible values for op: 1394 1395 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1396 1397*/ 1398 1399PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1400 PyObject *left, /* Left string */ 1401 PyObject *right, /* Right string */ 1402 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1403 ); 1404 1405/* Apply an argument tuple or dictionary to a format string and return 1406 the resulting Unicode string. */ 1407 1408PyAPI_FUNC(PyObject *) PyUnicode_Format( 1409 PyObject *format, /* Format string */ 1410 PyObject *args /* Argument tuple or dictionary */ 1411 ); 1412 1413/* Checks whether element is contained in container and returns 1/0 1414 accordingly. 1415 1416 element has to coerce to a one-element Unicode string. -1 is 1417 returned in case of an error. */ 1418 1419PyAPI_FUNC(int) PyUnicode_Contains( 1420 PyObject *container, /* Container string */ 1421 PyObject *element /* Element string */ 1422 ); 1423 1424/* Checks whether argument is a valid identifier. */ 1425 1426PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1427 1428/* Externally visible for str.strip(unicode) */ 1429PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 1430 PyUnicodeObject *self, 1431 int striptype, 1432 PyObject *sepobj 1433 ); 1434 1435/* Using the current locale, insert the thousands grouping 1436 into the string pointed to by buffer. 
For the argument descriptions, 1437 see Objects/stringlib/localeutil.h */ 1438 1439PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer, 1440 Py_ssize_t n_buffer, 1441 Py_UNICODE *digits, 1442 Py_ssize_t n_digits, 1443 Py_ssize_t min_width); 1444 1445/* Using explicit passed-in values, insert the thousands grouping 1446 into the string pointed to by buffer. For the argument descriptions, 1447 see Objects/stringlib/localeutil.h */ 1448PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer, 1449 Py_ssize_t n_buffer, 1450 Py_UNICODE *digits, 1451 Py_ssize_t n_digits, 1452 Py_ssize_t min_width, 1453 const char *grouping, 1454 const char *thousands_sep); 1455/* === Characters Type APIs =============================================== */ 1456 1457/* Helper array used by Py_UNICODE_ISSPACE(). */ 1458 1459PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 1460 1461/* These should not be used directly. Use the Py_UNICODE_IS* and 1462 Py_UNICODE_TO* macros instead. 1463 1464 These APIs are implemented in Objects/unicodectype.c. 
1465 1466*/ 1467 1468PyAPI_FUNC(int) _PyUnicode_IsLowercase( 1469 Py_UCS4 ch /* Unicode character */ 1470 ); 1471 1472PyAPI_FUNC(int) _PyUnicode_IsUppercase( 1473 Py_UCS4 ch /* Unicode character */ 1474 ); 1475 1476PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 1477 Py_UCS4 ch /* Unicode character */ 1478 ); 1479 1480PyAPI_FUNC(int) _PyUnicode_IsXidStart( 1481 Py_UCS4 ch /* Unicode character */ 1482 ); 1483 1484PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 1485 Py_UCS4 ch /* Unicode character */ 1486 ); 1487 1488PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 1489 const Py_UCS4 ch /* Unicode character */ 1490 ); 1491 1492PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 1493 const Py_UCS4 ch /* Unicode character */ 1494 ); 1495 1496PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 1497 Py_UCS4 ch /* Unicode character */ 1498 ); 1499 1500PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 1501 Py_UCS4 ch /* Unicode character */ 1502 ); 1503 1504PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 1505 Py_UCS4 ch /* Unicode character */ 1506 ); 1507 1508PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 1509 Py_UCS4 ch /* Unicode character */ 1510 ); 1511 1512PyAPI_FUNC(int) _PyUnicode_ToDigit( 1513 Py_UCS4 ch /* Unicode character */ 1514 ); 1515 1516PyAPI_FUNC(double) _PyUnicode_ToNumeric( 1517 Py_UCS4 ch /* Unicode character */ 1518 ); 1519 1520PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 1521 Py_UCS4 ch /* Unicode character */ 1522 ); 1523 1524PyAPI_FUNC(int) _PyUnicode_IsDigit( 1525 Py_UCS4 ch /* Unicode character */ 1526 ); 1527 1528PyAPI_FUNC(int) _PyUnicode_IsNumeric( 1529 Py_UCS4 ch /* Unicode character */ 1530 ); 1531 1532PyAPI_FUNC(int) _PyUnicode_IsPrintable( 1533 Py_UCS4 ch /* Unicode character */ 1534 ); 1535 1536PyAPI_FUNC(int) _PyUnicode_IsAlpha( 1537 Py_UCS4 ch /* Unicode character */ 1538 ); 1539 1540PyAPI_FUNC(size_t) Py_UNICODE_strlen( 1541 const Py_UNICODE *u 1542 ); 1543 1544PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 1545 Py_UNICODE *s1, 1546 const Py_UNICODE *s2); 1547 1548PyAPI_FUNC(Py_UNICODE*) 
Py_UNICODE_strcat( 1549 Py_UNICODE *s1, const Py_UNICODE *s2); 1550 1551PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 1552 Py_UNICODE *s1, 1553 const Py_UNICODE *s2, 1554 size_t n); 1555 1556PyAPI_FUNC(int) Py_UNICODE_strcmp( 1557 const Py_UNICODE *s1, 1558 const Py_UNICODE *s2 1559 ); 1560 1561PyAPI_FUNC(int) Py_UNICODE_strncmp( 1562 const Py_UNICODE *s1, 1563 const Py_UNICODE *s2, 1564 size_t n 1565 ); 1566 1567PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 1568 const Py_UNICODE *s, 1569 Py_UNICODE c 1570 ); 1571 1572PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 1573 const Py_UNICODE *s, 1574 Py_UNICODE c 1575 ); 1576 1577/* Create a copy of a unicode string ending with a nul character. Return NULL 1578 and raise a MemoryError exception on memory allocation failure, otherwise 1579 return a new allocated buffer (use PyMem_Free() to free the buffer). */ 1580 1581PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 1582 PyObject *unicode 1583 ); 1584 1585#ifdef __cplusplus 1586} 1587#endif 1588#endif /* !Py_UNICODEOBJECT_H */ 1589