unicodeobject.h revision beb4135b8c81e1dbbb841ecd7355ab5a09a3edd2
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal (see file Misc/unicode.txt). 11 12Copyright (c) Corporation for National Research Initiatives. 13 14 15 Original header: 16 -------------------------------------------------------------------- 17 18 * Yet another Unicode string type for Python. This type supports the 19 * 16-bit Basic Multilingual Plane (BMP) only. 20 * 21 * Written by Fredrik Lundh, January 1999. 22 * 23 * Copyright (c) 1999 by Secret Labs AB. 24 * Copyright (c) 1999 by Fredrik Lundh. 25 * 26 * fredrik@pythonware.com 27 * http://www.pythonware.com 28 * 29 * -------------------------------------------------------------------- 30 * This Unicode String Type is 31 * 32 * Copyright (c) 1999 by Secret Labs AB 33 * Copyright (c) 1999 by Fredrik Lundh 34 * 35 * By obtaining, using, and/or copying this software and/or its 36 * associated documentation, you agree that you have read, understood, 37 * and will comply with the following terms and conditions: 38 * 39 * Permission to use, copy, modify, and distribute this software and its 40 * associated documentation for any purpose and without fee is hereby 41 * granted, provided that the above copyright notice appears in all 42 * copies, and that both that copyright notice and this permission notice 43 * appear in supporting documentation, and that the name of Secret Labs 44 * AB or the author not be used in advertising or publicity pertaining to 45 * distribution of the software without specific, written prior 46 * permission. 47 * 48 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 50 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 55 * -------------------------------------------------------------------- */ 56 57#include <ctype.h> 58 59/* === Internal API ======================================================= */ 60 61/* --- Internal Unicode Format -------------------------------------------- */ 62 63/* Python 3.x requires unicode */ 64#define Py_USING_UNICODE 65 66/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is 67 properly set, but the default rules below doesn't set it. I'll 68 sort this out some other day -- fredrik@pythonware.com */ 69 70#ifndef Py_UNICODE_SIZE 71#error Must define Py_UNICODE_SIZE 72#endif 73 74/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode 75 strings are stored as UCS-2 (with limited support for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Defaults for various platforms */ 87#ifndef PY_UNICODE_TYPE 88 89/* Windows has a usable wchar_t type (unless we're using UCS-4) */ 90# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 91# define HAVE_USABLE_WCHAR_T 92# define PY_UNICODE_TYPE wchar_t 93# endif 94 95# if defined(Py_UNICODE_WIDE) 96# define PY_UNICODE_TYPE Py_UCS4 97# endif 98 99#endif 100 101/* If the compiler provides a wchar_t type we try to support it 102 through the interface functions PyUnicode_FromWideChar(), 103 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). 
*/ 104 105#ifdef HAVE_USABLE_WCHAR_T 106# ifndef HAVE_WCHAR_H 107# define HAVE_WCHAR_H 108# endif 109#endif 110 111#ifdef HAVE_WCHAR_H 112/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 113# ifdef _HAVE_BSDI 114# include <time.h> 115# endif 116# include <wchar.h> 117#endif 118 119/* 120 * Use this typedef when you need to represent a UTF-16 surrogate pair 121 * as single unsigned integer. 122 */ 123#if SIZEOF_INT >= 4 124typedef unsigned int Py_UCS4; 125#elif SIZEOF_LONG >= 4 126typedef unsigned long Py_UCS4; 127#endif 128 129/* Py_UNICODE is the native Unicode storage format (code unit) used by 130 Python and represents a single Unicode element in the Unicode 131 type. */ 132 133typedef PY_UNICODE_TYPE Py_UNICODE; 134 135/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ 136 137/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds 138 produce different external names and thus cause import errors in 139 case Python interpreters and extensions with mixed compiled in 140 Unicode width assumptions are combined. 
*/ 141 142#ifndef Py_UNICODE_WIDE 143 144# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString 145# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString 146# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject 147# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode 148# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject 149# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString 150# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode 151# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String 152# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString 153# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String 154# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String 155# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String 156# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode 157# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString 158# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar 159# define PyUnicode_AsWideCharString PyUnicodeUCS2_AsWideCharString 160# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist 161# define PyUnicode_Compare PyUnicodeUCS2_Compare 162# define PyUnicode_CompareWithASCII PyUnicodeUCS2_CompareASCII 163# define PyUnicode_Concat PyUnicodeUCS2_Concat 164# define PyUnicode_Append PyUnicodeUCS2_Append 165# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel 166# define PyUnicode_Contains PyUnicodeUCS2_Contains 167# define PyUnicode_Count PyUnicodeUCS2_Count 168# define PyUnicode_Decode PyUnicodeUCS2_Decode 169# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII 170# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap 171# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 172# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault 173# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize 174# define PyUnicode_DecodeRawUnicodeEscape 
PyUnicodeUCS2_DecodeRawUnicodeEscape 175# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 176# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful 177# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 178# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful 179# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 180# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful 181# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape 182# define PyUnicode_Encode PyUnicodeUCS2_Encode 183# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII 184# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap 185# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal 186# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 187# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape 188# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 189# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 190# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 191# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape 192# define PyUnicode_Find PyUnicodeUCS2_Find 193# define PyUnicode_Format PyUnicodeUCS2_Format 194# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject 195# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat 196# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV 197# define PyUnicode_FromObject PyUnicodeUCS2_FromObject 198# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal 199# define PyUnicode_FromString PyUnicodeUCS2_FromString 200# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize 201# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode 202# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar 203# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter 204# define PyUnicode_FSDecoder PyUnicodeUCS2_FSDecoder 205# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding 
206# define PyUnicode_GetMax PyUnicodeUCS2_GetMax 207# define PyUnicode_GetSize PyUnicodeUCS2_GetSize 208# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier 209# define PyUnicode_Join PyUnicodeUCS2_Join 210# define PyUnicode_Partition PyUnicodeUCS2_Partition 211# define PyUnicode_RPartition PyUnicodeUCS2_RPartition 212# define PyUnicode_RSplit PyUnicodeUCS2_RSplit 213# define PyUnicode_Replace PyUnicodeUCS2_Replace 214# define PyUnicode_Resize PyUnicodeUCS2_Resize 215# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare 216# define PyUnicode_Split PyUnicodeUCS2_Split 217# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines 218# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch 219# define PyUnicode_Translate PyUnicodeUCS2_Translate 220# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap 221# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString 222# define _PyUnicode_Fini _PyUnicodeUCS2_Fini 223# define _PyUnicode_Init _PyUnicodeUCS2_Init 224# define PyUnicode_strdup PyUnicodeUCS2_strdup 225 226#else 227 228# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString 229# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString 230# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject 231# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode 232# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject 233# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString 234# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode 235# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String 236# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString 237# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String 238# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String 239# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String 240# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode 241# define PyUnicode_AsUnicodeEscapeString 
PyUnicodeUCS4_AsUnicodeEscapeString 242# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar 243# define PyUnicode_AsWideCharString PyUnicodeUCS4_AsWideCharString 244# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist 245# define PyUnicode_Compare PyUnicodeUCS4_Compare 246# define PyUnicode_CompareWithASCII PyUnicodeUCS4_CompareWithASCII 247# define PyUnicode_Concat PyUnicodeUCS4_Concat 248# define PyUnicode_Append PyUnicodeUCS4_Append 249# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel 250# define PyUnicode_Contains PyUnicodeUCS4_Contains 251# define PyUnicode_Count PyUnicodeUCS4_Count 252# define PyUnicode_Decode PyUnicodeUCS4_Decode 253# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII 254# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap 255# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 256# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault 257# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize 258# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape 259# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 260# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful 261# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 262# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful 263# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 264# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful 265# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape 266# define PyUnicode_Encode PyUnicodeUCS4_Encode 267# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII 268# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap 269# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal 270# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 271# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape 272# define PyUnicode_EncodeUTF32 
PyUnicodeUCS4_EncodeUTF32 273# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 274# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 275# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape 276# define PyUnicode_Find PyUnicodeUCS4_Find 277# define PyUnicode_Format PyUnicodeUCS4_Format 278# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject 279# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat 280# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV 281# define PyUnicode_FromObject PyUnicodeUCS4_FromObject 282# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal 283# define PyUnicode_FromString PyUnicodeUCS4_FromString 284# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize 285# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode 286# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar 287# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter 288# define PyUnicode_FSDecoder PyUnicodeUCS4_FSDecoder 289# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding 290# define PyUnicode_GetMax PyUnicodeUCS4_GetMax 291# define PyUnicode_GetSize PyUnicodeUCS4_GetSize 292# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier 293# define PyUnicode_Join PyUnicodeUCS4_Join 294# define PyUnicode_Partition PyUnicodeUCS4_Partition 295# define PyUnicode_RPartition PyUnicodeUCS4_RPartition 296# define PyUnicode_RSplit PyUnicodeUCS4_RSplit 297# define PyUnicode_Replace PyUnicodeUCS4_Replace 298# define PyUnicode_Resize PyUnicodeUCS4_Resize 299# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare 300# define PyUnicode_Split PyUnicodeUCS4_Split 301# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines 302# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch 303# define PyUnicode_Translate PyUnicodeUCS4_Translate 304# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap 305# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString 306# define 
_PyUnicode_Fini _PyUnicodeUCS4_Fini 307# define _PyUnicode_Init _PyUnicodeUCS4_Init 308# define PyUnicode_strdup PyUnicodeUCS4_strdup 309 310#endif 311 312/* --- Internal Unicode Operations ---------------------------------------- */ 313 314/* Since splitting on whitespace is an important use case, and 315 whitespace in most situations is solely ASCII whitespace, we 316 optimize for the common case by using a quick look-up table 317 _Py_ascii_whitespace (see below) with an inlined check. 318 319 */ 320#define Py_UNICODE_ISSPACE(ch) \ 321 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 322 323#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 324#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 325#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 326#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 327 328#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 329#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 330#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 331 332#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 333#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 334#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 335#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 336 337#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 338#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 339#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 340 341#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 342 343#define Py_UNICODE_ISALNUM(ch) \ 344 (Py_UNICODE_ISALPHA(ch) || \ 345 Py_UNICODE_ISDECIMAL(ch) || \ 346 Py_UNICODE_ISDIGIT(ch) || \ 347 Py_UNICODE_ISNUMERIC(ch)) 348 349#define Py_UNICODE_COPY(target, source, length) \ 350 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 351 352#define Py_UNICODE_FILL(target, value, length) \ 353 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 354 for (i_ = 0; i_ 
< (length); i_++) t_[i_] = v_;\ 355 } while (0) 356 357/* Check if substring matches at given offset. the offset must be 358 valid, and the substring must not be empty */ 359 360#define Py_UNICODE_MATCH(string, offset, substring) \ 361 ((*((string)->str + (offset)) == *((substring)->str)) && \ 362 ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ 363 !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) 364 365#ifdef __cplusplus 366extern "C" { 367#endif 368 369/* --- Unicode Type ------------------------------------------------------- */ 370 371typedef struct { 372 PyObject_HEAD 373 Py_ssize_t length; /* Length of raw Unicode data in buffer */ 374 Py_UNICODE *str; /* Raw Unicode buffer */ 375 long hash; /* Hash value; -1 if not set */ 376 int state; /* != 0 if interned. In this case the two 377 * references from the dictionary to this object 378 * are *not* counted in ob_refcnt. */ 379 PyObject *defenc; /* (Default) Encoded version as Python 380 string, or NULL; this is used for 381 implementing the buffer protocol */ 382} PyUnicodeObject; 383 384PyAPI_DATA(PyTypeObject) PyUnicode_Type; 385PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 386 387#define SSTATE_NOT_INTERNED 0 388#define SSTATE_INTERNED_MORTAL 1 389#define SSTATE_INTERNED_IMMORTAL 2 390 391#define PyUnicode_Check(op) \ 392 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 393#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 394 395/* Fast access macros */ 396#define PyUnicode_GET_SIZE(op) \ 397 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length)) 398#define PyUnicode_GET_DATA_SIZE(op) \ 399 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))) 400#define PyUnicode_AS_UNICODE(op) \ 401 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str)) 402#define PyUnicode_AS_DATA(op) \ 403 (assert(PyUnicode_Check(op)),((const char 
*)((PyUnicodeObject *)(op))->str)) 404 405/* --- Constants ---------------------------------------------------------- */ 406 407/* This Unicode character will be used as replacement character during 408 decoding if the errors argument is set to "replace". Note: the 409 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 410 Unicode 3.0. */ 411 412#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 413 414/* === Public API ========================================================= */ 415 416/* --- Plain Py_UNICODE --------------------------------------------------- */ 417 418/* Create a Unicode Object from the Py_UNICODE buffer u of the given 419 size. 420 421 u may be NULL which causes the contents to be undefined. It is the 422 user's responsibility to fill in the needed data afterwards. Note 423 that modifying the Unicode object contents after construction is 424 only allowed if u was set to NULL. 425 426 The buffer is copied into the new object. */ 427 428PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 429 const Py_UNICODE *u, /* Unicode buffer */ 430 Py_ssize_t size /* size of buffer */ 431 ); 432 433/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 434PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 435 const char *u, /* char buffer */ 436 Py_ssize_t size /* size of buffer */ 437 ); 438 439/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 440 UTF-8 encoded bytes */ 441PyAPI_FUNC(PyObject*) PyUnicode_FromString( 442 const char *u /* string */ 443 ); 444 445/* Return a read-only pointer to the Unicode object's internal 446 Py_UNICODE buffer. */ 447 448PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 449 PyObject *unicode /* Unicode object */ 450 ); 451 452/* Get the length of the Unicode object. */ 453 454PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 455 PyObject *unicode /* Unicode object */ 456 ); 457 458/* Get the maximum ordinal for a Unicode character. 
*/ 459PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 460 461/* Resize an already allocated Unicode object to the new size length. 462 463 *unicode is modified to point to the new (resized) object and 0 464 returned on success. 465 466 This API may only be called by the function which also called the 467 Unicode constructor. The refcount on the object must be 1. Otherwise, 468 an error is returned. 469 470 Error handling is implemented as follows: an exception is set, -1 471 is returned and *unicode left untouched. 472 473*/ 474 475PyAPI_FUNC(int) PyUnicode_Resize( 476 PyObject **unicode, /* Pointer to the Unicode object */ 477 Py_ssize_t length /* New length */ 478 ); 479 480/* Coerce obj to an Unicode object and return a reference with 481 *incremented* refcount. 482 483 Coercion is done in the following way: 484 485 1. bytes, bytearray and other char buffer compatible objects are decoded 486 under the assumptions that they contain data using the current 487 default encoding. Decoding is done in "strict" mode. 488 489 2. All other objects (including Unicode objects) raise an 490 exception. 491 492 The API returns NULL in case of an error. The caller is responsible 493 for decref'ing the returned objects. 494 495*/ 496 497PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 498 register PyObject *obj, /* Object */ 499 const char *encoding, /* encoding */ 500 const char *errors /* error handling */ 501 ); 502 503/* Coerce obj to an Unicode object and return a reference with 504 *incremented* refcount. 505 506 Unicode objects are passed back as-is (subclasses are converted to 507 true Unicode objects), all other objects are delegated to 508 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 509 using UTF-8 encoding as basis for decoding the object. 510 511 The API returns NULL in case of an error. The caller is responsible 512 for decref'ing the returned objects. 
513 514*/ 515 516PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 517 register PyObject *obj /* Object */ 518 ); 519 520PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 521 const char *format, /* ASCII-encoded string */ 522 va_list vargs 523 ); 524PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 525 const char *format, /* ASCII-encoded string */ 526 ... 527 ); 528 529/* Format the object based on the format_spec, as defined in PEP 3101 530 (Advanced String Formatting). */ 531PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj, 532 Py_UNICODE *format_spec, 533 Py_ssize_t format_spec_len); 534 535PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 536PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 537PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *); 538PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 539 540/* Use only if you know it's a string */ 541#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state) 542 543/* --- wchar_t support for platforms which support it --------------------- */ 544 545#ifdef HAVE_WCHAR_H 546 547/* Create a Unicode Object from the wchar_t buffer w of the given 548 size. 549 550 The buffer is copied into the new object. */ 551 552PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 553 register const wchar_t *w, /* wchar_t buffer */ 554 Py_ssize_t size /* size of buffer */ 555 ); 556 557/* Copies the Unicode Object contents into the wchar_t buffer w. At 558 most size wchar_t characters are copied. 559 560 Note that the resulting wchar_t string may or may not be 561 0-terminated. It is the responsibility of the caller to make sure 562 that the wchar_t string is 0-terminated in case this is required by 563 the application. 564 565 Returns the number of wchar_t characters copied (excluding a 566 possibly trailing 0-termination character) or -1 in case of an 567 error. 
*/ 568 569PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 570 PyUnicodeObject *unicode, /* Unicode object */ 571 register wchar_t *w, /* wchar_t buffer */ 572 Py_ssize_t size /* size of buffer */ 573 ); 574 575/* Convert the Unicode object to a wide character string. The output string 576 always ends with a nul character. If size is not NULL, write the number of 577 wide characters (including the nul character) into *size. 578 579 Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it) 580 on success. On error, returns NULL, *size is undefined and raises a 581 MemoryError. */ 582 583PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 584 PyObject *unicode, /* Unicode object */ 585 Py_ssize_t *size /* number of characters of the result */ 586 ); 587 588#endif 589 590/* --- Unicode ordinals --------------------------------------------------- */ 591 592/* Create a Unicode Object from the given Unicode code point ordinal. 593 594 The ordinal must be in range(0x10000) on narrow Python builds 595 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 596 raised in case it is not. 597 598*/ 599 600PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 601 602/* --- Free-list management ----------------------------------------------- */ 603 604/* Clear the free list used by the Unicode implementation. 605 606 This can be used to release memory used for objects on the free 607 list back to the Python memory allocator. 608 609*/ 610 611PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 612 613/* === Builtin Codecs ===================================================== 614 615 Many of these APIs take two arguments encoding and errors. These 616 parameters encoding and errors have the same semantics as the ones 617 of the builtin unicode() API. 618 619 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 620 621 Error handling is set by errors which may also be set to NULL 622 meaning to use the default handling defined for the codec. 
Default 623 error handling for all builtin codecs is "strict" (ValueErrors are 624 raised). 625 626 The codecs all use a similar interface. Only deviation from the 627 generic ones are documented. 628 629*/ 630 631/* --- Manage the default encoding ---------------------------------------- */ 632 633/* Return a Python string holding the default encoded value of the 634 Unicode object. 635 636 The resulting string is cached in the Unicode object for subsequent 637 usage by this function. The cached version is needed to implement 638 the character buffer interface and will live (at least) as long as 639 the Unicode object itself. 640 641 The refcount of the string is *not* incremented. 642 643 *** Exported for internal use by the interpreter only !!! *** 644 645*/ 646 647PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString( 648 PyObject *unicode, 649 const char *errors); 650 651/* Returns a pointer to the default encoding (normally, UTF-8) of the 652 Unicode object unicode and the size of the encoded representation 653 in bytes stored in *size. 654 655 In case of an error, no *size is set. 656 657 *** This API is for interpreter INTERNAL USE ONLY and will likely 658 *** be removed or changed for Python 3.1. 659 660 *** If you need to access the Unicode object as UTF-8 bytes string, 661 *** please use PyUnicode_AsUTF8String() instead. 662 663*/ 664 665PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize( 666 PyObject *unicode, 667 Py_ssize_t *size); 668 669/* Returns a pointer to the default encoding (normally, UTf-8) of the 670 Unicode object unicode. 671 672 Use of this API is DEPRECATED since no size information can be 673 extracted from the returned data. 674 675 *** This API is for interpreter INTERNAL USE ONLY and will likely 676 *** be removed or changed for Python 3.1. 677 678 *** If you need to access the Unicode object as UTF-8 bytes string, 679 *** please use PyUnicode_AsUTF8String() instead. 
680 681*/ 682 683PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode); 684 685/* Returns the currently active default encoding. 686 687 The default encoding is currently implemented as run-time settable 688 process global. This may change in future versions of the 689 interpreter to become a parameter which is managed on a per-thread 690 basis. 691 692 */ 693 694PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 695 696/* --- Generic Codecs ----------------------------------------------------- */ 697 698/* Create a Unicode object by decoding the encoded string s of the 699 given size. */ 700 701PyAPI_FUNC(PyObject*) PyUnicode_Decode( 702 const char *s, /* encoded string */ 703 Py_ssize_t size, /* size of buffer */ 704 const char *encoding, /* encoding */ 705 const char *errors /* error handling */ 706 ); 707 708/* Decode a Unicode object unicode and return the result as Python 709 object. */ 710 711PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 712 PyObject *unicode, /* Unicode object */ 713 const char *encoding, /* encoding */ 714 const char *errors /* error handling */ 715 ); 716 717/* Decode a Unicode object unicode and return the result as Unicode 718 object. */ 719 720PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 721 PyObject *unicode, /* Unicode object */ 722 const char *encoding, /* encoding */ 723 const char *errors /* error handling */ 724 ); 725 726/* Encodes a Py_UNICODE buffer of the given size and returns a 727 Python string object. */ 728 729PyAPI_FUNC(PyObject*) PyUnicode_Encode( 730 const Py_UNICODE *s, /* Unicode char buffer */ 731 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 732 const char *encoding, /* encoding */ 733 const char *errors /* error handling */ 734 ); 735 736/* Encodes a Unicode object and returns the result as Python 737 object. 
*/ 738 739PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 740 PyObject *unicode, /* Unicode object */ 741 const char *encoding, /* encoding */ 742 const char *errors /* error handling */ 743 ); 744 745/* Encodes a Unicode object and returns the result as Python string 746 object. */ 747 748PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 749 PyObject *unicode, /* Unicode object */ 750 const char *encoding, /* encoding */ 751 const char *errors /* error handling */ 752 ); 753 754/* Encodes a Unicode object and returns the result as Unicode 755 object. */ 756 757PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 758 PyObject *unicode, /* Unicode object */ 759 const char *encoding, /* encoding */ 760 const char *errors /* error handling */ 761 ); 762 763/* Build an encoding map. */ 764 765PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 766 PyObject* string /* 256 character map */ 767 ); 768 769/* --- UTF-7 Codecs ------------------------------------------------------- */ 770 771PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 772 const char *string, /* UTF-7 encoded string */ 773 Py_ssize_t length, /* size of string */ 774 const char *errors /* error handling */ 775 ); 776 777PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 778 const char *string, /* UTF-7 encoded string */ 779 Py_ssize_t length, /* size of string */ 780 const char *errors, /* error handling */ 781 Py_ssize_t *consumed /* bytes consumed */ 782 ); 783 784PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 785 const Py_UNICODE *data, /* Unicode char buffer */ 786 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 787 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 788 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 789 const char *errors /* error handling */ 790 ); 791 792/* --- UTF-8 Codecs ------------------------------------------------------- */ 793 794PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 795 const char *string, /* UTF-8 encoded string */ 796 
Py_ssize_t length, /* size of string */ 797 const char *errors /* error handling */ 798 ); 799 800PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 801 const char *string, /* UTF-8 encoded string */ 802 Py_ssize_t length, /* size of string */ 803 const char *errors, /* error handling */ 804 Py_ssize_t *consumed /* bytes consumed */ 805 ); 806 807PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 808 PyObject *unicode /* Unicode object */ 809 ); 810 811PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 812 const Py_UNICODE *data, /* Unicode char buffer */ 813 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 814 const char *errors /* error handling */ 815 ); 816 817/* --- UTF-32 Codecs ------------------------------------------------------ */ 818 819/* Decodes length bytes from a UTF-32 encoded buffer string and returns 820 the corresponding Unicode object. 821 822 errors (if non-NULL) defines the error handling. It defaults 823 to "strict". 824 825 If byteorder is non-NULL, the decoder starts decoding using the 826 given byte order: 827 828 *byteorder == -1: little endian 829 *byteorder == 0: native order 830 *byteorder == 1: big endian 831 832 In native mode, the first four bytes of the stream are checked for a 833 BOM mark. If found, the BOM mark is analysed, the byte order 834 adjusted and the BOM skipped. In the other modes, no BOM mark 835 interpretation is done. After completion, *byteorder is set to the 836 current byte order at the end of input data. 837 838 If byteorder is NULL, the codec starts in native order mode. 
839 840*/ 841 842PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 843 const char *string, /* UTF-32 encoded string */ 844 Py_ssize_t length, /* size of string */ 845 const char *errors, /* error handling */ 846 int *byteorder /* pointer to byteorder to use 847 0=native;-1=LE,1=BE; updated on 848 exit */ 849 ); 850 851PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 852 const char *string, /* UTF-32 encoded string */ 853 Py_ssize_t length, /* size of string */ 854 const char *errors, /* error handling */ 855 int *byteorder, /* pointer to byteorder to use 856 0=native;-1=LE,1=BE; updated on 857 exit */ 858 Py_ssize_t *consumed /* bytes consumed */ 859 ); 860 861/* Returns a Python string using the UTF-32 encoding in native byte 862 order. The string always starts with a BOM mark. */ 863 864PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 865 PyObject *unicode /* Unicode object */ 866 ); 867 868/* Returns a Python string object holding the UTF-32 encoded value of 869 the Unicode data. 870 871 If byteorder is not 0, output is written according to the following 872 byte order: 873 874 byteorder == -1: little endian 875 byteorder == 0: native byte order (writes a BOM mark) 876 byteorder == 1: big endian 877 878 If byteorder is 0, the output string will always start with the 879 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 880 prepended. 881 882*/ 883 884PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 885 const Py_UNICODE *data, /* Unicode char buffer */ 886 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 887 const char *errors, /* error handling */ 888 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 889 ); 890 891/* --- UTF-16 Codecs ------------------------------------------------------ */ 892 893/* Decodes length bytes from a UTF-16 encoded buffer string and returns 894 the corresponding Unicode object. 895 896 errors (if non-NULL) defines the error handling. It defaults 897 to "strict". 
898 899 If byteorder is non-NULL, the decoder starts decoding using the 900 given byte order: 901 902 *byteorder == -1: little endian 903 *byteorder == 0: native order 904 *byteorder == 1: big endian 905 906 In native mode, the first two bytes of the stream are checked for a 907 BOM mark. If found, the BOM mark is analysed, the byte order 908 adjusted and the BOM skipped. In the other modes, no BOM mark 909 interpretation is done. After completion, *byteorder is set to the 910 current byte order at the end of input data. 911 912 If byteorder is NULL, the codec starts in native order mode. 913 914*/ 915 916PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 917 const char *string, /* UTF-16 encoded string */ 918 Py_ssize_t length, /* size of string */ 919 const char *errors, /* error handling */ 920 int *byteorder /* pointer to byteorder to use 921 0=native;-1=LE,1=BE; updated on 922 exit */ 923 ); 924 925PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 926 const char *string, /* UTF-16 encoded string */ 927 Py_ssize_t length, /* size of string */ 928 const char *errors, /* error handling */ 929 int *byteorder, /* pointer to byteorder to use 930 0=native;-1=LE,1=BE; updated on 931 exit */ 932 Py_ssize_t *consumed /* bytes consumed */ 933 ); 934 935/* Returns a Python string using the UTF-16 encoding in native byte 936 order. The string always starts with a BOM mark. */ 937 938PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 939 PyObject *unicode /* Unicode object */ 940 ); 941 942/* Returns a Python string object holding the UTF-16 encoded value of 943 the Unicode data. 944 945 If byteorder is not 0, output is written according to the following 946 byte order: 947 948 byteorder == -1: little endian 949 byteorder == 0: native byte order (writes a BOM mark) 950 byteorder == 1: big endian 951 952 If byteorder is 0, the output string will always start with the 953 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 954 prepended. 
955 956 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 957 UCS-2. This trick makes it possible to add full UTF-16 capabilities 958 at a later point without compromising the APIs. 959 960*/ 961 962PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 963 const Py_UNICODE *data, /* Unicode char buffer */ 964 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 965 const char *errors, /* error handling */ 966 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 967 ); 968 969/* --- Unicode-Escape Codecs ---------------------------------------------- */ 970 971PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 972 const char *string, /* Unicode-Escape encoded string */ 973 Py_ssize_t length, /* size of string */ 974 const char *errors /* error handling */ 975 ); 976 977PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 978 PyObject *unicode /* Unicode object */ 979 ); 980 981PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 982 const Py_UNICODE *data, /* Unicode char buffer */ 983 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 984 ); 985 986/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 987 988PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 989 const char *string, /* Raw-Unicode-Escape encoded string */ 990 Py_ssize_t length, /* size of string */ 991 const char *errors /* error handling */ 992 ); 993 994PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 995 PyObject *unicode /* Unicode object */ 996 ); 997 998PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 999 const Py_UNICODE *data, /* Unicode char buffer */ 1000 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1001 ); 1002 1003/* --- Unicode Internal Codec --------------------------------------------- 1004 1005 Only for internal use in _codecsmodule.c */ 1006 1007PyObject *_PyUnicode_DecodeUnicodeInternal( 1008 const char *string, 1009 Py_ssize_t length, 1010 const char *errors 1011 ); 1012 1013/* 
--- Latin-1 Codecs ----------------------------------------------------- 1014 1015 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1016 1017*/ 1018 1019PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1020 const char *string, /* Latin-1 encoded string */ 1021 Py_ssize_t length, /* size of string */ 1022 const char *errors /* error handling */ 1023 ); 1024 1025PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1026 PyObject *unicode /* Unicode object */ 1027 ); 1028 1029PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1030 const Py_UNICODE *data, /* Unicode char buffer */ 1031 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1032 const char *errors /* error handling */ 1033 ); 1034 1035/* --- ASCII Codecs ------------------------------------------------------- 1036 1037 Only 7-bit ASCII data is accepted. All other codes generate errors. 1038 1039*/ 1040 1041PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1042 const char *string, /* ASCII encoded string */ 1043 Py_ssize_t length, /* size of string */ 1044 const char *errors /* error handling */ 1045 ); 1046 1047PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1048 PyObject *unicode /* Unicode object */ 1049 ); 1050 1051PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1052 const Py_UNICODE *data, /* Unicode char buffer */ 1053 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1054 const char *errors /* error handling */ 1055 ); 1056 1057/* --- Character Map Codecs ----------------------------------------------- 1058 1059 This codec uses mappings to encode and decode characters. 1060 1061 Decoding mappings must map single string characters to single 1062 Unicode characters, integers (which are then interpreted as Unicode 1063 ordinals) or None (meaning "undefined mapping" and causing an 1064 error). 
1065 1066 Encoding mappings must map single Unicode characters to single 1067 string characters, integers (which are then interpreted as Latin-1 1068 ordinals) or None (meaning "undefined mapping" and causing an 1069 error). 1070 1071 If a character lookup fails with a LookupError, the character is 1072 copied as-is meaning that its ordinal value will be interpreted as 1073 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1074 to contain those mappings which map characters to different code 1075 points. 1076 1077*/ 1078 1079PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1080 const char *string, /* Encoded string */ 1081 Py_ssize_t length, /* size of string */ 1082 PyObject *mapping, /* character mapping 1083 (char ordinal -> unicode ordinal) */ 1084 const char *errors /* error handling */ 1085 ); 1086 1087PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1088 PyObject *unicode, /* Unicode object */ 1089 PyObject *mapping /* character mapping 1090 (unicode ordinal -> char ordinal) */ 1091 ); 1092 1093PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1094 const Py_UNICODE *data, /* Unicode char buffer */ 1095 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1096 PyObject *mapping, /* character mapping 1097 (unicode ordinal -> char ordinal) */ 1098 const char *errors /* error handling */ 1099 ); 1100 1101/* Translate a Py_UNICODE buffer of the given length by applying a 1102 character mapping table to it and return the resulting Unicode 1103 object. 1104 1105 The mapping table must map Unicode ordinal integers to Unicode 1106 ordinal integers or None (causing deletion of the character). 1107 1108 Mapping tables may be dictionaries or sequences. Unmapped character 1109 ordinals (ones which cause a LookupError) are left untouched and 1110 are copied as-is. 
1111 1112*/ 1113 1114PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1115 const Py_UNICODE *data, /* Unicode char buffer */ 1116 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */ 1117 PyObject *table, /* Translate table */ 1118 const char *errors /* error handling */ 1119 ); 1120 1121#ifdef MS_WIN32 1122 1123/* --- MBCS codecs for Windows -------------------------------------------- */ 1124 1125PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1126 const char *string, /* MBCS encoded string */ 1127 Py_ssize_t length, /* size of string */ 1128 const char *errors /* error handling */ 1129 ); 1130 1131PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1132 const char *string, /* MBCS encoded string */ 1133 Py_ssize_t length, /* size of string */ 1134 const char *errors, /* error handling */ 1135 Py_ssize_t *consumed /* bytes consumed */ 1136 ); 1137 1138PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1139 PyObject *unicode /* Unicode object */ 1140 ); 1141 1142PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1143 const Py_UNICODE *data, /* Unicode char buffer */ 1144 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1145 const char *errors /* error handling */ 1146 ); 1147 1148#endif /* MS_WIN32 */ 1149 1150/* --- Decimal Encoder ---------------------------------------------------- */ 1151 1152/* Takes a Unicode string holding a decimal value and writes it into 1153 an output buffer using standard ASCII digit codes. 1154 1155 The output buffer has to provide at least length+1 bytes of storage 1156 area. The output string is 0-terminated. 1157 1158 The encoder converts whitespace to ' ', decimal characters to their 1159 corresponding ASCII digit and all other Latin-1 characters except 1160 \0 as-is. Characters outside this range (Unicode ordinals 1-255) 1161 are treated as errors. This includes embedded NULL bytes. 
1162 1163 Error handling is defined by the errors argument: 1164 1165 NULL or "strict": raise a ValueError 1166 "ignore": ignore the wrong characters (these are not copied to the 1167 output buffer) 1168 "replace": replaces illegal characters with '?' 1169 1170 Returns 0 on success, -1 on failure. 1171 1172*/ 1173 1174PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1175 Py_UNICODE *s, /* Unicode buffer */ 1176 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1177 char *output, /* Output buffer; must have size >= length+1 */ 1178 const char *errors /* error handling */ 1179 ); 1180 1181/* --- File system encoding ---------------------------------------------- */ 1182 1183/* ParseTuple converter: encode str objects to bytes using 1184 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 1185 1186PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1187 1188/* ParseTuple converter: decode bytes objects to unicode using 1189 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 1190 1191PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1192 1193/* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1194 and the "surrogateescape" error handler. 1195 1196 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8. 1197 1198 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1199*/ 1200 1201PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1202 const char *s /* encoded string */ 1203 ); 1204 1205/* Decode a string using Py_FileSystemDefaultEncoding 1206 and the "surrogateescape" error handler. 1207 1208 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8. 1209*/ 1210 1211PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1212 const char *s, /* encoded string */ 1213 Py_ssize_t size /* size */ 1214 ); 1215 1216/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1217 "surrogateescape" error handler, and return bytes. 
1218 1219 If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8. 1220*/ 1221 1222PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1223 PyObject *unicode 1224 ); 1225 1226/* --- Methods & Slots ---------------------------------------------------- 1227 1228 These are capable of handling Unicode objects and strings on input 1229 (we refer to them as strings in the descriptions) and return 1230 Unicode objects or integers as appropriate. */ 1231 1232/* Concat two strings giving a new Unicode string. */ 1233 1234PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1235 PyObject *left, /* Left string */ 1236 PyObject *right /* Right string */ 1237 ); 1238 1239/* Concat two strings and put the result in *pleft 1240 (sets *pleft to NULL on error) */ 1241 1242PyAPI_FUNC(void) PyUnicode_Append( 1243 PyObject **pleft, /* Pointer to left string */ 1244 PyObject *right /* Right string */ 1245 ); 1246 1247/* Concat two strings, put the result in *pleft and drop the right object 1248 (sets *pleft to NULL on error) */ 1249 1250PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1251 PyObject **pleft, /* Pointer to left string */ 1252 PyObject *right /* Right string */ 1253 ); 1254 1255/* Split a string giving a list of Unicode strings. 1256 1257 If sep is NULL, splitting will be done at all whitespace 1258 substrings. Otherwise, splits occur at the given separator. 1259 1260 At most maxsplit splits will be done. If negative, no limit is set. 1261 1262 Separators are not included in the resulting list. 1263 1264*/ 1265 1266PyAPI_FUNC(PyObject*) PyUnicode_Split( 1267 PyObject *s, /* String to split */ 1268 PyObject *sep, /* String separator */ 1269 Py_ssize_t maxsplit /* Maxsplit count */ 1270 ); 1271 1272/* Ditto, but split at line breaks. 1273 1274 CRLF is considered to be one line break. Line breaks are not 1275 included in the resulting list unless keepends is true. 
*/ 1276 1277PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1278 PyObject *s, /* String to split */ 1279 int keepends /* If true, line end markers are included */ 1280 ); 1281 1282/* Partition a string using a given separator. */ 1283 1284PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1285 PyObject *s, /* String to partition */ 1286 PyObject *sep /* String separator */ 1287 ); 1288 1289/* Partition a string using a given separator, searching from the end of the 1290 string. */ 1291 1292PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1293 PyObject *s, /* String to partition */ 1294 PyObject *sep /* String separator */ 1295 ); 1296 1297/* Split a string giving a list of Unicode strings. 1298 1299 If sep is NULL, splitting will be done at all whitespace 1300 substrings. Otherwise, splits occur at the given separator. 1301 1302 At most maxsplit splits will be done. But unlike PyUnicode_Split, 1303 PyUnicode_RSplit splits from the end of the string. If maxsplit is negative, 1304 no limit is set. 1305 1306 Separators are not included in the resulting list. 1307 1308*/ 1309 1310PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1311 PyObject *s, /* String to split */ 1312 PyObject *sep, /* String separator */ 1313 Py_ssize_t maxsplit /* Maxsplit count */ 1314 ); 1315 1316/* Translate a string by applying a character mapping table to it and 1317 return the resulting Unicode object. 1318 1319 The mapping table must map Unicode ordinal integers to Unicode 1320 ordinal integers or None (causing deletion of the character). 1321 1322 Mapping tables may be dictionaries or sequences. Unmapped character 1323 ordinals (ones which cause a LookupError) are left untouched and 1324 are copied as-is. 1325 1326*/ 1327 1328PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1329 PyObject *str, /* String */ 1330 PyObject *table, /* Translate table */ 1331 const char *errors /* error handling */ 1332 ); 1333 1334/* Join a sequence of strings using the given separator and return 1335 the resulting Unicode string. 
*/ 1336 1337PyAPI_FUNC(PyObject*) PyUnicode_Join( 1338 PyObject *separator, /* Separator string */ 1339 PyObject *seq /* Sequence object */ 1340 ); 1341 1342/* Return 1 if substr matches str[start:end] at the given tail end, 0 1343 otherwise. */ 1344 1345PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1346 PyObject *str, /* String */ 1347 PyObject *substr, /* Prefix or Suffix string */ 1348 Py_ssize_t start, /* Start index */ 1349 Py_ssize_t end, /* Stop index */ 1350 int direction /* Tail end: -1 prefix, +1 suffix */ 1351 ); 1352 1353/* Return the first position of substr in str[start:end] using the 1354 given search direction or -1 if not found. -2 is returned in case 1355 an error occurred and an exception is set. */ 1356 1357PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1358 PyObject *str, /* String */ 1359 PyObject *substr, /* Substring to find */ 1360 Py_ssize_t start, /* Start index */ 1361 Py_ssize_t end, /* Stop index */ 1362 int direction /* Find direction: +1 forward, -1 backward */ 1363 ); 1364 1365/* Count the number of occurrences of substr in str[start:end]. */ 1366 1367PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1368 PyObject *str, /* String */ 1369 PyObject *substr, /* Substring to count */ 1370 Py_ssize_t start, /* Start index */ 1371 Py_ssize_t end /* Stop index */ 1372 ); 1373 1374/* Replace at most maxcount occurrences of substr in str with replstr 1375 and return the resulting Unicode object. */ 1376 1377PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1378 PyObject *str, /* String */ 1379 PyObject *substr, /* Substring to find */ 1380 PyObject *replstr, /* Replacement string */ 1381 Py_ssize_t maxcount /* Max. number of replacements to apply; 1382 -1 = all */ 1383 ); 1384 1385/* Compare two strings and return -1, 0, 1 for less than, equal, 1386 greater than resp. 
*/ 1387 1388PyAPI_FUNC(int) PyUnicode_Compare( 1389 PyObject *left, /* Left string */ 1390 PyObject *right /* Right string */ 1391 ); 1392 1393PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1394 PyObject *left, 1395 const char *right 1396 ); 1397 1398/* Rich compare two strings and return one of the following: 1399 1400 - NULL in case an exception was raised 1401 - Py_True or Py_False for successful comparisons 1402 - Py_NotImplemented in case the type combination is unknown 1403 1404 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1405 case the conversion of the arguments to Unicode fails with a 1406 UnicodeDecodeError. 1407 1408 Possible values for op: 1409 1410 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1411 1412*/ 1413 1414PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1415 PyObject *left, /* Left string */ 1416 PyObject *right, /* Right string */ 1417 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1418 ); 1419 1420/* Apply an argument tuple or dictionary to a format string and return 1421 the resulting Unicode string. */ 1422 1423PyAPI_FUNC(PyObject *) PyUnicode_Format( 1424 PyObject *format, /* Format string */ 1425 PyObject *args /* Argument tuple or dictionary */ 1426 ); 1427 1428/* Checks whether element is contained in container and returns 1/0 1429 accordingly. 1430 1431 element has to coerce to a one-element Unicode string. -1 is 1432 returned in case of an error. */ 1433 1434PyAPI_FUNC(int) PyUnicode_Contains( 1435 PyObject *container, /* Container string */ 1436 PyObject *element /* Element string */ 1437 ); 1438 1439/* Checks whether argument is a valid identifier. 
*/ 1440 1441PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1442 1443/* Externally visible for str.strip(unicode) */ 1444PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 1445 PyUnicodeObject *self, 1446 int striptype, 1447 PyObject *sepobj 1448 ); 1449 1450/* Using the current locale, insert the thousands grouping 1451 into the string pointed to by buffer. 
For the argument descriptions, 1452 see Objects/stringlib/localeutil.h */ 1453 1454PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer, 1455 Py_ssize_t n_buffer, 1456 Py_UNICODE *digits, 1457 Py_ssize_t n_digits, 1458 Py_ssize_t min_width); 1459 1460/* Using explicit passed-in values, insert the thousands grouping 1461 into the string pointed to by buffer. For the argument descriptions, 1462 see Objects/stringlib/localeutil.h */ 1463PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer, 1464 Py_ssize_t n_buffer, 1465 Py_UNICODE *digits, 1466 Py_ssize_t n_digits, 1467 Py_ssize_t min_width, 1468 const char *grouping, 1469 const char *thousands_sep); 1470/* === Characters Type APIs =============================================== */ 1471 1472/* Helper array used by Py_UNICODE_ISSPACE(). */ 1473 1474PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 1475 1476/* These should not be used directly. Use the Py_UNICODE_IS* and 1477 Py_UNICODE_TO* macros instead. 1478 1479 These APIs are implemented in Objects/unicodectype.c. 
1480 1481*/ 1482 1483PyAPI_FUNC(int) _PyUnicode_IsLowercase( 1484 Py_UCS4 ch /* Unicode character */ 1485 ); 1486 1487PyAPI_FUNC(int) _PyUnicode_IsUppercase( 1488 Py_UCS4 ch /* Unicode character */ 1489 ); 1490 1491PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 1492 Py_UCS4 ch /* Unicode character */ 1493 ); 1494 1495PyAPI_FUNC(int) _PyUnicode_IsXidStart( 1496 Py_UCS4 ch /* Unicode character */ 1497 ); 1498 1499PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 1500 Py_UCS4 ch /* Unicode character */ 1501 ); 1502 1503PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 1504 const Py_UCS4 ch /* Unicode character */ 1505 ); 1506 1507PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 1508 const Py_UCS4 ch /* Unicode character */ 1509 ); 1510 1511PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 1512 Py_UCS4 ch /* Unicode character */ 1513 ); 1514 1515PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 1516 Py_UCS4 ch /* Unicode character */ 1517 ); 1518 1519PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 1520 Py_UCS4 ch /* Unicode character */ 1521 ); 1522 1523PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 1524 Py_UCS4 ch /* Unicode character */ 1525 ); 1526 1527PyAPI_FUNC(int) _PyUnicode_ToDigit( 1528 Py_UCS4 ch /* Unicode character */ 1529 ); 1530 1531PyAPI_FUNC(double) _PyUnicode_ToNumeric( 1532 Py_UCS4 ch /* Unicode character */ 1533 ); 1534 1535PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 1536 Py_UCS4 ch /* Unicode character */ 1537 ); 1538 1539PyAPI_FUNC(int) _PyUnicode_IsDigit( 1540 Py_UCS4 ch /* Unicode character */ 1541 ); 1542 1543PyAPI_FUNC(int) _PyUnicode_IsNumeric( 1544 Py_UCS4 ch /* Unicode character */ 1545 ); 1546 1547PyAPI_FUNC(int) _PyUnicode_IsPrintable( 1548 Py_UCS4 ch /* Unicode character */ 1549 ); 1550 1551PyAPI_FUNC(int) _PyUnicode_IsAlpha( 1552 Py_UCS4 ch /* Unicode character */ 1553 ); 1554 1555PyAPI_FUNC(size_t) Py_UNICODE_strlen( 1556 const Py_UNICODE *u 1557 ); 1558 1559PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 1560 Py_UNICODE *s1, 1561 const Py_UNICODE *s2); 1562 1563PyAPI_FUNC(Py_UNICODE*) 
Py_UNICODE_strcat( 1564 Py_UNICODE *s1, const Py_UNICODE *s2); 1565 1566PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 1567 Py_UNICODE *s1, 1568 const Py_UNICODE *s2, 1569 size_t n); 1570 1571PyAPI_FUNC(int) Py_UNICODE_strcmp( 1572 const Py_UNICODE *s1, 1573 const Py_UNICODE *s2 1574 ); 1575 1576PyAPI_FUNC(int) Py_UNICODE_strncmp( 1577 const Py_UNICODE *s1, 1578 const Py_UNICODE *s2, 1579 size_t n 1580 ); 1581 1582PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 1583 const Py_UNICODE *s, 1584 Py_UNICODE c 1585 ); 1586 1587PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 1588 const Py_UNICODE *s, 1589 Py_UNICODE c 1590 ); 1591 1592/* Create a copy of a unicode string ending with a nul character. Return NULL 1593 and raise a MemoryError exception on memory allocation failure, otherwise 1594 return a new allocated buffer (use PyMem_Free() to free the buffer). */ 1595 1596PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 1597 PyObject *unicode 1598 ); 1599 1600#ifdef __cplusplus 1601} 1602#endif 1603#endif /* !Py_UNICODEOBJECT_H */ 1604