unicodeobject.h revision 4d0d471a8031de90a2b1ce99c4ac4780e60b3bc9
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal. (See 11http://www.egenix.com/files/python/unicode-proposal.txt). 12 13Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58#include <ctype.h> 59 60/* === Internal API ======================================================= */ 61 62/* --- Internal Unicode Format -------------------------------------------- */ 63 64/* Python 3.x requires unicode */ 65#define Py_USING_UNICODE 66 67/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is 68 properly set, but the default rules below doesn't set it. I'll 69 sort this out some other day -- fredrik@pythonware.com */ 70 71#ifndef Py_UNICODE_SIZE 72#error Must define Py_UNICODE_SIZE 73#endif 74 75/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode 76 strings are stored as UCS-2 (with limited support for UTF-16) */ 77 78#if Py_UNICODE_SIZE >= 4 79#define Py_UNICODE_WIDE 80#endif 81 82/* Set these flags if the platform has "wchar.h" and the 83 wchar_t type is a 16-bit unsigned type */ 84/* #define HAVE_WCHAR_H */ 85/* #define HAVE_USABLE_WCHAR_T */ 86 87/* Defaults for various platforms */ 88#ifndef PY_UNICODE_TYPE 89 90/* Windows has a usable wchar_t type (unless we're using UCS-4) */ 91# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 92# define HAVE_USABLE_WCHAR_T 93# define PY_UNICODE_TYPE wchar_t 94# endif 95 96# if defined(Py_UNICODE_WIDE) 97# define PY_UNICODE_TYPE Py_UCS4 98# endif 99 100#endif 101 102/* If the compiler provides a wchar_t type we try to support it 103 through the interface functions PyUnicode_FromWideChar(), 104 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). 
*/ 105 106#ifdef HAVE_USABLE_WCHAR_T 107# ifndef HAVE_WCHAR_H 108# define HAVE_WCHAR_H 109# endif 110#endif 111 112#ifdef HAVE_WCHAR_H 113/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 114# ifdef _HAVE_BSDI 115# include <time.h> 116# endif 117# include <wchar.h> 118#endif 119 120/* 121 * Use this typedef when you need to represent a UTF-16 surrogate pair 122 * as single unsigned integer. 123 */ 124#if SIZEOF_INT >= 4 125typedef unsigned int Py_UCS4; 126#elif SIZEOF_LONG >= 4 127typedef unsigned long Py_UCS4; 128#endif 129 130/* Py_UNICODE is the native Unicode storage format (code unit) used by 131 Python and represents a single Unicode element in the Unicode 132 type. */ 133 134#ifndef Py_LIMITED_API 135typedef PY_UNICODE_TYPE Py_UNICODE; 136#endif 137 138/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ 139 140/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds 141 produce different external names and thus cause import errors in 142 case Python interpreters and extensions with mixed compiled in 143 Unicode width assumptions are combined. 
*/ 144 145#ifndef Py_UNICODE_WIDE 146 147# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString 148# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString 149# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject 150# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode 151# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject 152# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString 153# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode 154# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String 155# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString 156# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String 157# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String 158# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String 159# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode 160# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString 161# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar 162# define PyUnicode_AsWideCharString PyUnicodeUCS2_AsWideCharString 163# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist 164# define PyUnicode_Compare PyUnicodeUCS2_Compare 165# define PyUnicode_CompareWithASCIIString PyUnicodeUCS2_CompareWithASCIIString 166# define PyUnicode_Concat PyUnicodeUCS2_Concat 167# define PyUnicode_Append PyUnicodeUCS2_Append 168# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel 169# define PyUnicode_Contains PyUnicodeUCS2_Contains 170# define PyUnicode_Count PyUnicodeUCS2_Count 171# define PyUnicode_Decode PyUnicodeUCS2_Decode 172# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII 173# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap 174# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 175# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault 176# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize 177# define 
PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape 178# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 179# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful 180# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 181# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful 182# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 183# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful 184# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape 185# define PyUnicode_Encode PyUnicodeUCS2_Encode 186# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII 187# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap 188# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal 189# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 190# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape 191# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 192# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 193# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 194# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape 195# define PyUnicode_Find PyUnicodeUCS2_Find 196# define PyUnicode_Format PyUnicodeUCS2_Format 197# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject 198# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat 199# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV 200# define PyUnicode_FromObject PyUnicodeUCS2_FromObject 201# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal 202# define PyUnicode_FromString PyUnicodeUCS2_FromString 203# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize 204# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode 205# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar 206# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter 207# define PyUnicode_FSDecoder PyUnicodeUCS2_FSDecoder 208# define PyUnicode_GetDefaultEncoding 
PyUnicodeUCS2_GetDefaultEncoding 209# define PyUnicode_GetMax PyUnicodeUCS2_GetMax 210# define PyUnicode_GetSize PyUnicodeUCS2_GetSize 211# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier 212# define PyUnicode_Join PyUnicodeUCS2_Join 213# define PyUnicode_Partition PyUnicodeUCS2_Partition 214# define PyUnicode_RPartition PyUnicodeUCS2_RPartition 215# define PyUnicode_RSplit PyUnicodeUCS2_RSplit 216# define PyUnicode_Replace PyUnicodeUCS2_Replace 217# define PyUnicode_Resize PyUnicodeUCS2_Resize 218# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare 219# define PyUnicode_Split PyUnicodeUCS2_Split 220# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines 221# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch 222# define PyUnicode_Translate PyUnicodeUCS2_Translate 223# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap 224# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString 225# define _PyUnicode_Fini _PyUnicodeUCS2_Fini 226# define _PyUnicode_Init _PyUnicodeUCS2_Init 227# define PyUnicode_strdup PyUnicodeUCS2_strdup 228 229#else 230 231# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString 232# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString 233# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject 234# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode 235# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject 236# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString 237# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode 238# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String 239# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString 240# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String 241# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String 242# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String 243# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode 244# define 
PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString 245# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar 246# define PyUnicode_AsWideCharString PyUnicodeUCS4_AsWideCharString 247# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist 248# define PyUnicode_Compare PyUnicodeUCS4_Compare 249# define PyUnicode_CompareWithASCIIString PyUnicodeUCS4_CompareWithASCIIString 250# define PyUnicode_Concat PyUnicodeUCS4_Concat 251# define PyUnicode_Append PyUnicodeUCS4_Append 252# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel 253# define PyUnicode_Contains PyUnicodeUCS4_Contains 254# define PyUnicode_Count PyUnicodeUCS4_Count 255# define PyUnicode_Decode PyUnicodeUCS4_Decode 256# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII 257# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap 258# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 259# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault 260# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize 261# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape 262# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 263# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful 264# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 265# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful 266# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 267# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful 268# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape 269# define PyUnicode_Encode PyUnicodeUCS4_Encode 270# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII 271# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap 272# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal 273# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 274# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape 275# define 
PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 276# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 277# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 278# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape 279# define PyUnicode_Find PyUnicodeUCS4_Find 280# define PyUnicode_Format PyUnicodeUCS4_Format 281# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject 282# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat 283# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV 284# define PyUnicode_FromObject PyUnicodeUCS4_FromObject 285# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal 286# define PyUnicode_FromString PyUnicodeUCS4_FromString 287# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize 288# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode 289# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar 290# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter 291# define PyUnicode_FSDecoder PyUnicodeUCS4_FSDecoder 292# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding 293# define PyUnicode_GetMax PyUnicodeUCS4_GetMax 294# define PyUnicode_GetSize PyUnicodeUCS4_GetSize 295# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier 296# define PyUnicode_Join PyUnicodeUCS4_Join 297# define PyUnicode_Partition PyUnicodeUCS4_Partition 298# define PyUnicode_RPartition PyUnicodeUCS4_RPartition 299# define PyUnicode_RSplit PyUnicodeUCS4_RSplit 300# define PyUnicode_Replace PyUnicodeUCS4_Replace 301# define PyUnicode_Resize PyUnicodeUCS4_Resize 302# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare 303# define PyUnicode_Split PyUnicodeUCS4_Split 304# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines 305# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch 306# define PyUnicode_Translate PyUnicodeUCS4_Translate 307# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap 308# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString 
309# define _PyUnicode_Fini _PyUnicodeUCS4_Fini 310# define _PyUnicode_Init _PyUnicodeUCS4_Init 311# define PyUnicode_strdup PyUnicodeUCS4_strdup 312 313#endif 314 315/* --- Internal Unicode Operations ---------------------------------------- */ 316 317/* Since splitting on whitespace is an important use case, and 318 whitespace in most situations is solely ASCII whitespace, we 319 optimize for the common case by using a quick look-up table 320 _Py_ascii_whitespace (see below) with an inlined check. 321 322 */ 323#ifndef Py_LIMITED_API 324#define Py_UNICODE_ISSPACE(ch) \ 325 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 326 327#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 328#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 329#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 330#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 331 332#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 333#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 334#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 335 336#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 337#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 338#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 339#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 340 341#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 342#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 343#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 344 345#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 346 347#define Py_UNICODE_ISALNUM(ch) \ 348 (Py_UNICODE_ISALPHA(ch) || \ 349 Py_UNICODE_ISDECIMAL(ch) || \ 350 Py_UNICODE_ISDIGIT(ch) || \ 351 Py_UNICODE_ISNUMERIC(ch)) 352 353#define Py_UNICODE_COPY(target, source, length) \ 354 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 355 356#define Py_UNICODE_FILL(target, value, length) \ 357 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); 
Py_UNICODE v_ = (value);\ 358 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 359 } while (0) 360 361/* Check if substring matches at given offset. The offset must be 362 valid, and the substring must not be empty. */ 363 364#define Py_UNICODE_MATCH(string, offset, substring) \ 365 ((*((string)->str + (offset)) == *((substring)->str)) && \ 366 ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ 367 !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) 368#endif /* Py_LIMITED_API */ 369 370#ifdef __cplusplus 371extern "C" { 372#endif 373 374/* --- Unicode Type ------------------------------------------------------- */ 375 376#ifndef Py_LIMITED_API 377typedef struct { 378 PyObject_HEAD 379 Py_ssize_t length; /* Length of raw Unicode data in buffer */ 380 Py_UNICODE *str; /* Raw Unicode buffer */ 381 Py_hash_t hash; /* Hash value; -1 if not set */ 382 int state; /* != 0 if interned. In this case the two 383 * references from the dictionary to this object 384 * are *not* counted in ob_refcnt. 
*/ 385 PyObject *defenc; /* (Default) Encoded version as Python 386 string, or NULL; this is used for 387 implementing the buffer protocol */ 388} PyUnicodeObject; 389#endif 390 391PyAPI_DATA(PyTypeObject) PyUnicode_Type; 392PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 393 394#define SSTATE_NOT_INTERNED 0 395#define SSTATE_INTERNED_MORTAL 1 396#define SSTATE_INTERNED_IMMORTAL 2 397 398#define PyUnicode_Check(op) \ 399 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 400#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 401 402/* Fast access macros */ 403#ifndef Py_LIMITED_API 404#define PyUnicode_GET_SIZE(op) \ 405 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length)) 406#define PyUnicode_GET_DATA_SIZE(op) \ 407 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))) 408#define PyUnicode_AS_UNICODE(op) \ 409 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str)) 410#define PyUnicode_AS_DATA(op) \ 411 (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str)) 412#endif 413 414/* --- Constants ---------------------------------------------------------- */ 415 416/* This Unicode character will be used as replacement character during 417 decoding if the errors argument is set to "replace". Note: the 418 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 419 Unicode 3.0. */ 420 421#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 422 423/* === Public API ========================================================= */ 424 425/* --- Plain Py_UNICODE --------------------------------------------------- */ 426 427/* Create a Unicode Object from the Py_UNICODE buffer u of the given 428 size. 429 430 u may be NULL which causes the contents to be undefined. It is the 431 user's responsibility to fill in the needed data afterwards. Note 432 that modifying the Unicode object contents after construction is 433 only allowed if u was set to NULL. 
434 435 The buffer is copied into the new object. */ 436 437#ifndef Py_LIMITED_API 438PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 439 const Py_UNICODE *u, /* Unicode buffer */ 440 Py_ssize_t size /* size of buffer */ 441 ); 442#endif 443 444/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 445PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 446 const char *u, /* char buffer */ 447 Py_ssize_t size /* size of buffer */ 448 ); 449 450/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 451 UTF-8 encoded bytes */ 452PyAPI_FUNC(PyObject*) PyUnicode_FromString( 453 const char *u /* string */ 454 ); 455 456/* Return a read-only pointer to the Unicode object's internal 457 Py_UNICODE buffer. */ 458 459#ifndef Py_LIMITED_API 460PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 461 PyObject *unicode /* Unicode object */ 462 ); 463#endif 464 465/* Get the length of the Unicode object. */ 466 467PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 468 PyObject *unicode /* Unicode object */ 469 ); 470 471#ifndef Py_LIMITED_API 472/* Get the maximum ordinal for a Unicode character. */ 473PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 474#endif 475 476/* Resize an already allocated Unicode object to the new size length. 477 478 *unicode is modified to point to the new (resized) object and 0 479 returned on success. 480 481 This API may only be called by the function which also called the 482 Unicode constructor. The refcount on the object must be 1. Otherwise, 483 an error is returned. 484 485 Error handling is implemented as follows: an exception is set, -1 486 is returned and *unicode left untouched. 487 488*/ 489 490PyAPI_FUNC(int) PyUnicode_Resize( 491 PyObject **unicode, /* Pointer to the Unicode object */ 492 Py_ssize_t length /* New length */ 493 ); 494 495/* Coerce obj to an Unicode object and return a reference with 496 *incremented* refcount. 497 498 Coercion is done in the following way: 499 500 1. 
bytes, bytearray and other char buffer compatible objects are decoded 501 under the assumptions that they contain data using the UTF-8 502 encoding. Decoding is done in "strict" mode. 503 504 2. All other objects (including Unicode objects) raise an 505 exception. 506 507 The API returns NULL in case of an error. The caller is responsible 508 for decref'ing the returned objects. 509 510*/ 511 512PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 513 register PyObject *obj, /* Object */ 514 const char *encoding, /* encoding */ 515 const char *errors /* error handling */ 516 ); 517 518/* Coerce obj to an Unicode object and return a reference with 519 *incremented* refcount. 520 521 Unicode objects are passed back as-is (subclasses are converted to 522 true Unicode objects), all other objects are delegated to 523 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 524 using UTF-8 encoding as basis for decoding the object. 525 526 The API returns NULL in case of an error. The caller is responsible 527 for decref'ing the returned objects. 528 529*/ 530 531PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 532 register PyObject *obj /* Object */ 533 ); 534 535PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 536 const char *format, /* ASCII-encoded string */ 537 va_list vargs 538 ); 539PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 540 const char *format, /* ASCII-encoded string */ 541 ... 542 ); 543 544#ifndef Py_LIMITED_API 545/* Format the object based on the format_spec, as defined in PEP 3101 546 (Advanced String Formatting). 
*/ 547PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj, 548 Py_UNICODE *format_spec, 549 Py_ssize_t format_spec_len); 550#endif 551 552PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 553PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 554PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *); 555#ifndef Py_LIMITED_API 556PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 557#endif 558 559/* Use only if you know it's a string */ 560#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state) 561 562/* --- wchar_t support for platforms which support it --------------------- */ 563 564#ifdef HAVE_WCHAR_H 565 566/* Create a Unicode Object from the wchar_t buffer w of the given 567 size. 568 569 The buffer is copied into the new object. */ 570 571PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 572 register const wchar_t *w, /* wchar_t buffer */ 573 Py_ssize_t size /* size of buffer */ 574 ); 575 576/* Copies the Unicode Object contents into the wchar_t buffer w. At 577 most size wchar_t characters are copied. 578 579 Note that the resulting wchar_t string may or may not be 580 0-terminated. It is the responsibility of the caller to make sure 581 that the wchar_t string is 0-terminated in case this is required by 582 the application. 583 584 Returns the number of wchar_t characters copied (excluding a 585 possibly trailing 0-termination character) or -1 in case of an 586 error. */ 587 588PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 589 PyObject *unicode, /* Unicode object */ 590 register wchar_t *w, /* wchar_t buffer */ 591 Py_ssize_t size /* size of buffer */ 592 ); 593 594/* Convert the Unicode object to a wide character string. The output string 595 always ends with a nul character. If size is not NULL, write the number of 596 wide characters (including the nul character) into *size. 597 598 Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it) 599 on success. 
On error, returns NULL, *size is undefined and raises a 600 MemoryError. */ 601 602PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 603 PyObject *unicode, /* Unicode object */ 604 Py_ssize_t *size /* number of characters of the result */ 605 ); 606 607#endif 608 609/* --- Unicode ordinals --------------------------------------------------- */ 610 611/* Create a Unicode Object from the given Unicode code point ordinal. 612 613 The ordinal must be in range(0x10000) on narrow Python builds 614 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 615 raised in case it is not. 616 617*/ 618 619PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 620 621/* --- Free-list management ----------------------------------------------- */ 622 623/* Clear the free list used by the Unicode implementation. 624 625 This can be used to release memory used for objects on the free 626 list back to the Python memory allocator. 627 628*/ 629 630PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 631 632/* === Builtin Codecs ===================================================== 633 634 Many of these APIs take two arguments encoding and errors. These 635 parameters encoding and errors have the same semantics as the ones 636 of the builtin str() API. 637 638 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 639 640 Error handling is set by errors which may also be set to NULL 641 meaning to use the default handling defined for the codec. Default 642 error handling for all builtin codecs is "strict" (ValueErrors are 643 raised). 644 645 The codecs all use a similar interface. Only deviation from the 646 generic ones are documented. 647 648*/ 649 650/* --- Manage the default encoding ---------------------------------------- */ 651 652/* Return a Python string holding the default encoded value of the 653 Unicode object. 
654 655 Same as PyUnicode_AsUTF8String() except 656 the resulting string is cached in the Unicode object for subsequent 657 usage by this function. The cached version is needed to implement 658 the character buffer interface and will live (at least) as long as 659 the Unicode object itself. 660 661 The refcount of the string is *not* incremented. 662 663 *** Exported for internal use by the interpreter only !!! *** 664 665*/ 666 667#ifndef Py_LIMITED_API 668PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString( 669 PyObject *unicode, 670 const char *errors); 671#endif 672 673/* Returns a pointer to the default encoding (UTF-8) of the 674 Unicode object unicode and the size of the encoded representation 675 in bytes stored in *size. 676 677 In case of an error, no *size is set. 678 679 *** This API is for interpreter INTERNAL USE ONLY and will likely 680 *** be removed or changed in the future. 681 682 *** If you need to access the Unicode object as UTF-8 bytes string, 683 *** please use PyUnicode_AsUTF8String() instead. 684 685*/ 686 687#ifndef Py_LIMITED_API 688PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize( 689 PyObject *unicode, 690 Py_ssize_t *size); 691#endif 692 693/* Returns a pointer to the default encoding (UTF-8) of the 694 Unicode object unicode. 695 696 Use of this API is DEPRECATED since no size information can be 697 extracted from the returned data. 698 699 *** This API is for interpreter INTERNAL USE ONLY and will likely 700 *** be removed or changed for Python 3.1. 701 702 *** If you need to access the Unicode object as UTF-8 bytes string, 703 *** please use PyUnicode_AsUTF8String() instead. 704 705*/ 706 707#ifndef Py_LIMITED_API 708PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode); 709#endif 710 711/* Returns "utf-8". 
*/ 712 713PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 714 715/* --- Generic Codecs ----------------------------------------------------- */ 716 717/* Create a Unicode object by decoding the encoded string s of the 718 given size. */ 719 720PyAPI_FUNC(PyObject*) PyUnicode_Decode( 721 const char *s, /* encoded string */ 722 Py_ssize_t size, /* size of buffer */ 723 const char *encoding, /* encoding */ 724 const char *errors /* error handling */ 725 ); 726 727/* Decode a Unicode object unicode and return the result as Python 728 object. */ 729 730PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 731 PyObject *unicode, /* Unicode object */ 732 const char *encoding, /* encoding */ 733 const char *errors /* error handling */ 734 ); 735 736/* Decode a Unicode object unicode and return the result as Unicode 737 object. */ 738 739PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 740 PyObject *unicode, /* Unicode object */ 741 const char *encoding, /* encoding */ 742 const char *errors /* error handling */ 743 ); 744 745/* Encodes a Py_UNICODE buffer of the given size and returns a 746 Python string object. */ 747 748#ifndef Py_LIMITED_API 749PyAPI_FUNC(PyObject*) PyUnicode_Encode( 750 const Py_UNICODE *s, /* Unicode char buffer */ 751 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 752 const char *encoding, /* encoding */ 753 const char *errors /* error handling */ 754 ); 755#endif 756 757/* Encodes a Unicode object and returns the result as Python 758 object. */ 759 760PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 761 PyObject *unicode, /* Unicode object */ 762 const char *encoding, /* encoding */ 763 const char *errors /* error handling */ 764 ); 765 766/* Encodes a Unicode object and returns the result as Python string 767 object. 
*/ 768 769PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 770 PyObject *unicode, /* Unicode object */ 771 const char *encoding, /* encoding */ 772 const char *errors /* error handling */ 773 ); 774 775/* Encodes a Unicode object and returns the result as Unicode 776 object. */ 777 778PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 779 PyObject *unicode, /* Unicode object */ 780 const char *encoding, /* encoding */ 781 const char *errors /* error handling */ 782 ); 783 784/* Build an encoding map. */ 785 786PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 787 PyObject* string /* 256 character map */ 788 ); 789 790/* --- UTF-7 Codecs ------------------------------------------------------- */ 791 792PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 793 const char *string, /* UTF-7 encoded string */ 794 Py_ssize_t length, /* size of string */ 795 const char *errors /* error handling */ 796 ); 797 798PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 799 const char *string, /* UTF-7 encoded string */ 800 Py_ssize_t length, /* size of string */ 801 const char *errors, /* error handling */ 802 Py_ssize_t *consumed /* bytes consumed */ 803 ); 804 805#ifndef Py_LIMITED_API 806PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 807 const Py_UNICODE *data, /* Unicode char buffer */ 808 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 809 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 810 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 811 const char *errors /* error handling */ 812 ); 813#endif 814 815/* --- UTF-8 Codecs ------------------------------------------------------- */ 816 817PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 818 const char *string, /* UTF-8 encoded string */ 819 Py_ssize_t length, /* size of string */ 820 const char *errors /* error handling */ 821 ); 822 823PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 824 const char *string, /* UTF-8 encoded string */ 825 Py_ssize_t length, /* size of string */ 826 
const char *errors, /* error handling */ 827 Py_ssize_t *consumed /* bytes consumed */ 828 ); 829 830PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 831 PyObject *unicode /* Unicode object */ 832 ); 833 834#ifndef Py_LIMITED_API 835PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 836 const Py_UNICODE *data, /* Unicode char buffer */ 837 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 838 const char *errors /* error handling */ 839 ); 840#endif 841 842/* --- UTF-32 Codecs ------------------------------------------------------ */ 843 844/* Decodes length bytes from a UTF-32 encoded buffer string and returns 845 the corresponding Unicode object. 846 847 errors (if non-NULL) defines the error handling. It defaults 848 to "strict". 849 850 If byteorder is non-NULL, the decoder starts decoding using the 851 given byte order: 852 853 *byteorder == -1: little endian 854 *byteorder == 0: native order 855 *byteorder == 1: big endian 856 857 In native mode, the first four bytes of the stream are checked for a 858 BOM mark. If found, the BOM mark is analysed, the byte order 859 adjusted and the BOM skipped. In the other modes, no BOM mark 860 interpretation is done. After completion, *byteorder is set to the 861 current byte order at the end of input data. 862 863 If byteorder is NULL, the codec starts in native order mode. 
864 865*/ 866 867PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 868 const char *string, /* UTF-32 encoded string */ 869 Py_ssize_t length, /* size of string */ 870 const char *errors, /* error handling */ 871 int *byteorder /* pointer to byteorder to use 872 0=native;-1=LE,1=BE; updated on 873 exit */ 874 ); 875 876PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 877 const char *string, /* UTF-32 encoded string */ 878 Py_ssize_t length, /* size of string */ 879 const char *errors, /* error handling */ 880 int *byteorder, /* pointer to byteorder to use 881 0=native;-1=LE,1=BE; updated on 882 exit */ 883 Py_ssize_t *consumed /* bytes consumed */ 884 ); 885 886/* Returns a Python string using the UTF-32 encoding in native byte 887 order. The string always starts with a BOM mark. */ 888 889PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 890 PyObject *unicode /* Unicode object */ 891 ); 892 893/* Returns a Python string object holding the UTF-32 encoded value of 894 the Unicode data. 895 896 If byteorder is not 0, output is written according to the following 897 byte order: 898 899 byteorder == -1: little endian 900 byteorder == 0: native byte order (writes a BOM mark) 901 byteorder == 1: big endian 902 903 If byteorder is 0, the output string will always start with the 904 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 905 prepended. 906 907*/ 908 909#ifndef Py_LIMITED_API 910PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 911 const Py_UNICODE *data, /* Unicode char buffer */ 912 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 913 const char *errors, /* error handling */ 914 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 915 ); 916#endif 917 918/* --- UTF-16 Codecs ------------------------------------------------------ */ 919 920/* Decodes length bytes from a UTF-16 encoded buffer string and returns 921 the corresponding Unicode object. 922 923 errors (if non-NULL) defines the error handling. It defaults 924 to "strict". 
925 926 If byteorder is non-NULL, the decoder starts decoding using the 927 given byte order: 928 929 *byteorder == -1: little endian 930 *byteorder == 0: native order 931 *byteorder == 1: big endian 932 933 In native mode, the first two bytes of the stream are checked for a 934 BOM mark. If found, the BOM mark is analysed, the byte order 935 adjusted and the BOM skipped. In the other modes, no BOM mark 936 interpretation is done. After completion, *byteorder is set to the 937 current byte order at the end of input data. 938 939 If byteorder is NULL, the codec starts in native order mode. 940 941*/ 942 943PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 944 const char *string, /* UTF-16 encoded string */ 945 Py_ssize_t length, /* size of string */ 946 const char *errors, /* error handling */ 947 int *byteorder /* pointer to byteorder to use 948 0=native;-1=LE,1=BE; updated on 949 exit */ 950 ); 951 952PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 953 const char *string, /* UTF-16 encoded string */ 954 Py_ssize_t length, /* size of string */ 955 const char *errors, /* error handling */ 956 int *byteorder, /* pointer to byteorder to use 957 0=native;-1=LE,1=BE; updated on 958 exit */ 959 Py_ssize_t *consumed /* bytes consumed */ 960 ); 961 962/* Returns a Python string using the UTF-16 encoding in native byte 963 order. The string always starts with a BOM mark. */ 964 965PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 966 PyObject *unicode /* Unicode object */ 967 ); 968 969/* Returns a Python string object holding the UTF-16 encoded value of 970 the Unicode data. 971 972 If byteorder is not 0, output is written according to the following 973 byte order: 974 975 byteorder == -1: little endian 976 byteorder == 0: native byte order (writes a BOM mark) 977 byteorder == 1: big endian 978 979 If byteorder is 0, the output string will always start with the 980 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 981 prepended. 
982 983 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 984 UCS-2. This trick makes it possible to add full UTF-16 capabilities 985 at a later point without compromising the APIs. 986 987*/ 988 989#ifndef Py_LIMITED_API 990PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 991 const Py_UNICODE *data, /* Unicode char buffer */ 992 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 993 const char *errors, /* error handling */ 994 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 995 ); 996#endif 997 998/* --- Unicode-Escape Codecs ---------------------------------------------- */ 999 1000PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 1001 const char *string, /* Unicode-Escape encoded string */ 1002 Py_ssize_t length, /* size of string */ 1003 const char *errors /* error handling */ 1004 ); 1005 1006PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 1007 PyObject *unicode /* Unicode object */ 1008 ); 1009 1010#ifndef Py_LIMITED_API 1011PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 1012 const Py_UNICODE *data, /* Unicode char buffer */ 1013 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1014 ); 1015#endif 1016 1017/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 1018 1019PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 1020 const char *string, /* Raw-Unicode-Escape encoded string */ 1021 Py_ssize_t length, /* size of string */ 1022 const char *errors /* error handling */ 1023 ); 1024 1025PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 1026 PyObject *unicode /* Unicode object */ 1027 ); 1028 1029#ifndef Py_LIMITED_API 1030PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 1031 const Py_UNICODE *data, /* Unicode char buffer */ 1032 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1033 ); 1034#endif 1035 1036/* --- Unicode Internal Codec --------------------------------------------- 1037 1038 Only for internal use in _codecsmodule.c */ 1039 
1040#ifndef Py_LIMITED_API /* NOTE(review): this prototype is not wrapped in PyAPI_FUNC(), unlike the other prototypes here -- presumably intentional for an internal-only helper; confirm. */ 1041PyObject *_PyUnicode_DecodeUnicodeInternal( 1042 const char *string, 1043 Py_ssize_t length, 1044 const char *errors 1045 ); 1046#endif 1047 1048/* --- Latin-1 Codecs ----------------------------------------------------- 1049 1050 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1051 1052*/ 1053 1054PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1055 const char *string, /* Latin-1 encoded string */ 1056 Py_ssize_t length, /* size of string */ 1057 const char *errors /* error handling */ 1058 ); 1059 1060PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1061 PyObject *unicode /* Unicode object */ 1062 ); 1063 1064#ifndef Py_LIMITED_API 1065PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1066 const Py_UNICODE *data, /* Unicode char buffer */ 1067 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1068 const char *errors /* error handling */ 1069 ); 1070#endif 1071 1072/* --- ASCII Codecs ------------------------------------------------------- 1073 1074 Only 7-bit ASCII data is accepted. All other codes generate errors. 1075 1076*/ 1077 1078PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1079 const char *string, /* ASCII encoded string */ 1080 Py_ssize_t length, /* size of string */ 1081 const char *errors /* error handling */ 1082 ); 1083 1084PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1085 PyObject *unicode /* Unicode object */ 1086 ); 1087 1088#ifndef Py_LIMITED_API 1089PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1090 const Py_UNICODE *data, /* Unicode char buffer */ 1091 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1092 const char *errors /* error handling */ 1093 ); 1094#endif 1095 1096/* --- Character Map Codecs ----------------------------------------------- 1097 1098 This codec uses mappings to encode and decode characters. 
1099 1100 Decoding mappings must map single string characters to single 1101 Unicode characters, integers (which are then interpreted as Unicode 1102 ordinals) or None (meaning "undefined mapping" and causing an 1103 error). 1104 1105 Encoding mappings must map single Unicode characters to single 1106 string characters, integers (which are then interpreted as Latin-1 1107 ordinals) or None (meaning "undefined mapping" and causing an 1108 error). 1109 1110 If a character lookup fails with a LookupError, the character is 1111 copied as-is meaning that its ordinal value will be interpreted as 1112 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1113 to contain those mappings which map characters to different code 1114 points. 1115 1116*/ 1117 1118PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1119 const char *string, /* Encoded string */ 1120 Py_ssize_t length, /* size of string */ 1121 PyObject *mapping, /* character mapping 1122 (char ordinal -> unicode ordinal) */ 1123 const char *errors /* error handling */ 1124 ); 1125 1126PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1127 PyObject *unicode, /* Unicode object */ 1128 PyObject *mapping /* character mapping 1129 (unicode ordinal -> char ordinal) */ 1130 ); 1131 1132#ifndef Py_LIMITED_API 1133PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1134 const Py_UNICODE *data, /* Unicode char buffer */ 1135 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1136 PyObject *mapping, /* character mapping 1137 (unicode ordinal -> char ordinal) */ 1138 const char *errors /* error handling */ 1139 ); 1140#endif 1141 1142/* Translate a Py_UNICODE buffer of the given length by applying a 1143 character mapping table to it and return the resulting Unicode 1144 object. 1145 1146 The mapping table must map Unicode ordinal integers to Unicode 1147 ordinal integers or None (causing deletion of the character). 1148 1149 Mapping tables may be dictionaries or sequences. 
Unmapped character 1150 ordinals (ones which cause a LookupError) are left untouched and 1151 are copied as-is. 1152 1153*/ 1154 1155#ifndef Py_LIMITED_API 1156PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1157 const Py_UNICODE *data, /* Unicode char buffer */ 1158 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */ 1159 PyObject *table, /* Translate table */ 1160 const char *errors /* error handling */ 1161 ); 1162#endif 1163 1164#ifdef MS_WIN32 1165 1166/* --- MBCS codecs for Windows -------------------------------------------- */ 1167 1168PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1169 const char *string, /* MBCS encoded string */ 1170 Py_ssize_t length, /* size of string */ 1171 const char *errors /* error handling */ 1172 ); 1173 1174PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1175 const char *string, /* MBCS encoded string */ 1176 Py_ssize_t length, /* size of string */ 1177 const char *errors, /* error handling */ 1178 Py_ssize_t *consumed /* bytes consumed */ 1179 ); 1180 1181PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1182 PyObject *unicode /* Unicode object */ 1183 ); 1184 1185#ifndef Py_LIMITED_API 1186PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1187 const Py_UNICODE *data, /* Unicode char buffer */ 1188 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1189 const char *errors /* error handling */ 1190 ); 1191#endif 1192 1193#endif /* MS_WIN32 */ 1194 1195/* --- Decimal Encoder ---------------------------------------------------- */ 1196 1197/* Takes a Unicode string holding a decimal value and writes it into 1198 an output buffer using standard ASCII digit codes. 1199 1200 The output buffer has to provide at least length+1 bytes of storage 1201 area. The output string is 0-terminated. 1202 1203 The encoder converts whitespace to ' ', decimal characters to their 1204 corresponding ASCII digit and all other Latin-1 characters except 1205 \0 as-is. 
Characters outside this range (Unicode ordinals 1-255) 1206 are treated as errors. This includes embedded NUL (\0) characters. 1207 1208 Error handling is defined by the errors argument: 1209 1210 NULL or "strict": raise a ValueError 1211 "ignore": ignore the wrong characters (these are not copied to the 1212 output buffer) 1213 "replace": replaces illegal characters with '?' 1214 1215 Returns 0 on success, -1 on failure. 1216 1217*/ 1218 1219#ifndef Py_LIMITED_API 1220PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1221 Py_UNICODE *s, /* Unicode buffer */ 1222 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1223 char *output, /* Output buffer; must have size >= length + 1 (room for the 0 terminator) */ 1224 const char *errors /* error handling */ 1225 ); 1226#endif 1227 1228/* --- File system encoding ---------------------------------------------- */ 1229 1230/* ParseTuple converter: encode str objects to bytes using 1231 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 1232 1233PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1234 1235/* ParseTuple converter: decode bytes objects to unicode using 1236 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 1237 1238PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1239 1240/* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1241 and the "surrogateescape" error handler. 1242 1243 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1244 encoding. 1245 1246 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1247*/ 1248 1249PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1250 const char *s /* encoded string */ 1251 ); 1252 1253/* Decode a string using Py_FileSystemDefaultEncoding 1254 and the "surrogateescape" error handler. 1255 1256 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1257 encoding. 
1258*/ 1259 1260PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1261 const char *s, /* encoded string */ 1262 Py_ssize_t size /* size */ 1263 ); 1264 1265/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1266 "surrogateescape" error handler, and return bytes. 1267 1268 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1269 encoding. 1270*/ 1271 1272PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1273 PyObject *unicode /* Unicode object */ 1274 ); 1275 1276/* --- Methods & Slots ---------------------------------------------------- 1277 1278 These are capable of handling Unicode objects and strings on input 1279 (we refer to them as strings in the descriptions) and return 1280 Unicode objects or integers as appropriate. */ 1281 1282/* Concatenate two strings giving a new Unicode string. */ 1283 1284PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1285 PyObject *left, /* Left string */ 1286 PyObject *right /* Right string */ 1287 ); 1288 1289/* Concatenate two strings and put the result in *pleft 1290 (sets *pleft to NULL on error) */ 1291 1292PyAPI_FUNC(void) PyUnicode_Append( 1293 PyObject **pleft, /* Pointer to left string */ 1294 PyObject *right /* Right string */ 1295 ); 1296 1297/* Concatenate two strings, put the result in *pleft and drop the right object 1298 (sets *pleft to NULL on error) */ 1299 1300PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1301 PyObject **pleft, /* Pointer to left string */ 1302 PyObject *right /* Right string */ 1303 ); 1304 1305/* Split a string giving a list of Unicode strings. 1306 1307 If sep is NULL, splitting will be done at all whitespace 1308 substrings. Otherwise, splits occur at the given separator. 1309 1310 At most maxsplit splits will be done. If negative, no limit is set. 1311 1312 Separators are not included in the resulting list. 
1313 1314*/ 1315 1316PyAPI_FUNC(PyObject*) PyUnicode_Split( 1317 PyObject *s, /* String to split */ 1318 PyObject *sep, /* String separator */ 1319 Py_ssize_t maxsplit /* Maxsplit count */ 1320 ); 1321 1322/* Like PyUnicode_Split(), but split at line breaks. 1323 1324 CRLF is considered to be one line break. Line breaks are not 1325 included in the resulting list unless keepends is true. */ 1326 1327PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1328 PyObject *s, /* String to split */ 1329 int keepends /* If true, line end markers are included */ 1330 ); 1331 1332/* Partition a string using a given separator. */ 1333 1334PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1335 PyObject *s, /* String to partition */ 1336 PyObject *sep /* String separator */ 1337 ); 1338 1339/* Partition a string using a given separator, searching from the end of the 1340 string. */ 1341 1342PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1343 PyObject *s, /* String to partition */ 1344 PyObject *sep /* String separator */ 1345 ); 1346 1347/* Split a string giving a list of Unicode strings. 1348 1349 If sep is NULL, splitting will be done at all whitespace 1350 substrings. Otherwise, splits occur at the given separator. 1351 1352 At most maxsplit splits will be done; if maxsplit is negative, no 1353 limit is set. Unlike PyUnicode_Split(), PyUnicode_RSplit() splits 1354 from the end of the string. 1355 1356 Separators are not included in the resulting list. 1357 1358*/ 1359 1360PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1361 PyObject *s, /* String to split */ 1362 PyObject *sep, /* String separator */ 1363 Py_ssize_t maxsplit /* Maxsplit count */ 1364 ); 1365 1366/* Translate a string by applying a character mapping table to it and 1367 return the resulting Unicode object. 1368 1369 The mapping table must map Unicode ordinal integers to Unicode 1370 ordinal integers or None (causing deletion of the character). 1371 1372 Mapping tables may be dictionaries or sequences. 
Unmapped character 1373 ordinals (ones which cause a LookupError) are left untouched and 1374 are copied as-is. 1375 1376*/ 1377 1378PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1379 PyObject *str, /* String */ 1380 PyObject *table, /* Translate table */ 1381 const char *errors /* error handling */ 1382 ); 1383 1384/* Join a sequence of strings using the given separator and return 1385 the resulting Unicode string. */ 1386 1387PyAPI_FUNC(PyObject*) PyUnicode_Join( 1388 PyObject *separator, /* Separator string */ 1389 PyObject *seq /* Sequence object */ 1390 ); 1391 1392/* Return 1 if substr matches str[start:end] at the given tail end, 0 1393 otherwise. */ 1394 1395PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1396 PyObject *str, /* String */ 1397 PyObject *substr, /* Prefix or Suffix string */ 1398 Py_ssize_t start, /* Start index */ 1399 Py_ssize_t end, /* Stop index */ 1400 int direction /* Tail end: -1 prefix, +1 suffix */ 1401 ); 1402 1403/* Return the first position of substr in str[start:end] using the 1404 given search direction or -1 if not found. -2 is returned in case 1405 an error occurred and an exception is set. */ 1406 1407PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1408 PyObject *str, /* String */ 1409 PyObject *substr, /* Substring to find */ 1410 Py_ssize_t start, /* Start index */ 1411 Py_ssize_t end, /* Stop index */ 1412 int direction /* Find direction: +1 forward, -1 backward */ 1413 ); 1414 1415/* Count the number of occurrences of substr in str[start:end]. */ 1416 1417PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1418 PyObject *str, /* String */ 1419 PyObject *substr, /* Substring to count */ 1420 Py_ssize_t start, /* Start index */ 1421 Py_ssize_t end /* Stop index */ 1422 ); 1423 1424/* Replace at most maxcount occurrences of substr in str with replstr 1425 and return the resulting Unicode object. 
*/ 1426 1427PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1428 PyObject *str, /* String */ 1429 PyObject *substr, /* Substring to find */ 1430 PyObject *replstr, /* Replacement substring */ 1431 Py_ssize_t maxcount /* Max. number of replacements to apply; 1432 -1 = all */ 1433 ); 1434 1435/* Compare two strings and return -1, 0, 1 for less than, equal, 1436 greater than resp. */ 1437 1438PyAPI_FUNC(int) PyUnicode_Compare( 1439 PyObject *left, /* Left string */ 1440 PyObject *right /* Right string */ 1441 ); 1442 1443PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1444 PyObject *left, 1445 const char *right 1446 ); 1447 1448/* Rich compare two strings and return one of the following: 1449 1450 - NULL in case an exception was raised 1451 - Py_True or Py_False for successful comparisons 1452 - Py_NotImplemented in case the type combination is unknown 1453 1454 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1455 case the conversion of the arguments to Unicode fails with a 1456 UnicodeDecodeError. 1457 1458 Possible values for op: 1459 1460 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1461 1462*/ 1463 1464PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1465 PyObject *left, /* Left string */ 1466 PyObject *right, /* Right string */ 1467 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1468 ); 1469 1470/* Apply an argument tuple or dictionary to a format string and return 1471 the resulting Unicode string. */ 1472 1473PyAPI_FUNC(PyObject *) PyUnicode_Format( 1474 PyObject *format, /* Format string */ 1475 PyObject *args /* Argument tuple or dictionary */ 1476 ); 1477 1478/* Checks whether element is contained in container and returns 1/0 1479 accordingly. 1480 1481 element has to coerce to a one-element Unicode string. -1 is 1482 returned in case of an error. 
*/ 1483 1484PyAPI_FUNC(int) PyUnicode_Contains( 1485 PyObject *container, /* Container string */ 1486 PyObject *element /* Element string */ 1487 ); 1488 1489/* Checks whether argument is a valid identifier. */ 1490 1491PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1492 1493#ifndef Py_LIMITED_API 1494/* Externally visible for str.strip(unicode) */ 1495PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 1496 PyUnicodeObject *self, 1497 int striptype, 1498 PyObject *sepobj 1499 ); 1500#endif 1501 1502/* Using the current locale, insert the thousands grouping 1503 into the string pointed to by buffer. For the argument descriptions, 1504 see Objects/stringlib/localeutil.h */ 1505 1506#ifndef Py_LIMITED_API 1507PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer, 1508 Py_ssize_t n_buffer, 1509 Py_UNICODE *digits, 1510 Py_ssize_t n_digits, 1511 Py_ssize_t min_width); 1512#endif 1513 1514/* Using explicit passed-in values, insert the thousands grouping 1515 into the string pointed to by buffer. For the argument descriptions, 1516 see Objects/stringlib/localeutil.h */ 1517#ifndef Py_LIMITED_API 1518PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer, 1519 Py_ssize_t n_buffer, 1520 Py_UNICODE *digits, 1521 Py_ssize_t n_digits, 1522 Py_ssize_t min_width, 1523 const char *grouping, 1524 const char *thousands_sep); 1525#endif 1526/* === Characters Type APIs =============================================== */ 1527 1528/* Helper array used by Py_UNICODE_ISSPACE(). */ 1529 1530#ifndef Py_LIMITED_API 1531PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 1532 1533/* These should not be used directly. Use the Py_UNICODE_IS* and 1534 Py_UNICODE_TO* macros instead. 1535 1536 These APIs are implemented in Objects/unicodectype.c. 
1537 1538*/ 1539 1540PyAPI_FUNC(int) _PyUnicode_IsLowercase( 1541 Py_UCS4 ch /* Unicode character */ 1542 ); 1543 1544PyAPI_FUNC(int) _PyUnicode_IsUppercase( 1545 Py_UCS4 ch /* Unicode character */ 1546 ); 1547 1548PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 1549 Py_UCS4 ch /* Unicode character */ 1550 ); 1551 1552PyAPI_FUNC(int) _PyUnicode_IsXidStart( 1553 Py_UCS4 ch /* Unicode character */ 1554 ); 1555 1556PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 1557 Py_UCS4 ch /* Unicode character */ 1558 ); 1559 1560PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 1561 const Py_UCS4 ch /* Unicode character */ 1562 ); 1563 1564PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 1565 const Py_UCS4 ch /* Unicode character */ 1566 ); 1567 1568PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 1569 Py_UCS4 ch /* Unicode character */ 1570 ); 1571 1572PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 1573 Py_UCS4 ch /* Unicode character */ 1574 ); 1575 1576PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 1577 Py_UCS4 ch /* Unicode character */ 1578 ); 1579 1580PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 1581 Py_UCS4 ch /* Unicode character */ 1582 ); 1583 1584PyAPI_FUNC(int) _PyUnicode_ToDigit( 1585 Py_UCS4 ch /* Unicode character */ 1586 ); 1587 1588PyAPI_FUNC(double) _PyUnicode_ToNumeric( 1589 Py_UCS4 ch /* Unicode character */ 1590 ); 1591 1592PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 1593 Py_UCS4 ch /* Unicode character */ 1594 ); 1595 1596PyAPI_FUNC(int) _PyUnicode_IsDigit( 1597 Py_UCS4 ch /* Unicode character */ 1598 ); 1599 1600PyAPI_FUNC(int) _PyUnicode_IsNumeric( 1601 Py_UCS4 ch /* Unicode character */ 1602 ); 1603 1604PyAPI_FUNC(int) _PyUnicode_IsPrintable( 1605 Py_UCS4 ch /* Unicode character */ 1606 ); 1607 1608PyAPI_FUNC(int) _PyUnicode_IsAlpha( 1609 Py_UCS4 ch /* Unicode character */ 1610 ); 1611 1612PyAPI_FUNC(size_t) Py_UNICODE_strlen( 1613 const Py_UNICODE *u 1614 ); 1615 1616PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 1617 Py_UNICODE *s1, 1618 const Py_UNICODE *s2); 1619 1620PyAPI_FUNC(Py_UNICODE*) 
Py_UNICODE_strcat( 1621 Py_UNICODE *s1, const Py_UNICODE *s2); 1622 1623PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 1624 Py_UNICODE *s1, 1625 const Py_UNICODE *s2, 1626 size_t n); 1627 1628PyAPI_FUNC(int) Py_UNICODE_strcmp( 1629 const Py_UNICODE *s1, 1630 const Py_UNICODE *s2 1631 ); 1632 1633PyAPI_FUNC(int) Py_UNICODE_strncmp( 1634 const Py_UNICODE *s1, 1635 const Py_UNICODE *s2, 1636 size_t n 1637 ); 1638 1639PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 1640 const Py_UNICODE *s, 1641 Py_UNICODE c 1642 ); 1643 1644PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 1645 const Py_UNICODE *s, 1646 Py_UNICODE c 1647 ); 1648 1649/* Create a copy of a unicode string ending with a nul character. Return NULL 1650 and raise a MemoryError exception on memory allocation failure, otherwise 1651 return a new allocated buffer (use PyMem_Free() to free the buffer). */ 1652 1653PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 1654 PyObject *unicode 1655 ); 1656#endif /* Py_LIMITED_API */ 1657 1658#ifdef __cplusplus 1659} 1660#endif 1661#endif /* !Py_UNICODEOBJECT_H */ 1662