unicodeobject.h revision 190d79e5c648174b550de2bef75d1b4addf0d625
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal (see file Misc/unicode.txt). 11 12Copyright (c) Corporation for National Research Initiatives. 13 14 15 Original header: 16 -------------------------------------------------------------------- 17 18 * Yet another Unicode string type for Python. This type supports the 19 * 16-bit Basic Multilingual Plane (BMP) only. 20 * 21 * Written by Fredrik Lundh, January 1999. 22 * 23 * Copyright (c) 1999 by Secret Labs AB. 24 * Copyright (c) 1999 by Fredrik Lundh. 25 * 26 * fredrik@pythonware.com 27 * http://www.pythonware.com 28 * 29 * -------------------------------------------------------------------- 30 * This Unicode String Type is 31 * 32 * Copyright (c) 1999 by Secret Labs AB 33 * Copyright (c) 1999 by Fredrik Lundh 34 * 35 * By obtaining, using, and/or copying this software and/or its 36 * associated documentation, you agree that you have read, understood, 37 * and will comply with the following terms and conditions: 38 * 39 * Permission to use, copy, modify, and distribute this software and its 40 * associated documentation for any purpose and without fee is hereby 41 * granted, provided that the above copyright notice appears in all 42 * copies, and that both that copyright notice and this permission notice 43 * appear in supporting documentation, and that the name of Secret Labs 44 * AB or the author not be used in advertising or publicity pertaining to 45 * distribution of the software without specific, written prior 46 * permission. 47 * 48 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 50 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 55 * -------------------------------------------------------------------- */ 56 57#include <ctype.h> 58 59/* === Internal API ======================================================= */ 60 61/* --- Internal Unicode Format -------------------------------------------- */ 62 63/* Python 3.x requires unicode */ 64#define Py_USING_UNICODE 65 66/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is 67 properly set, but the default rules below don't set it. I'll 68 sort this out some other day -- fredrik@pythonware.com */ 69 /* Py_UNICODE_SIZE must be defined before this point; compilation stops with an error otherwise. */ 70#ifndef Py_UNICODE_SIZE 71#error Must define Py_UNICODE_SIZE 72#endif 73 74/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode 75 strings are stored as UCS-2 (with limited support for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h", "wctype.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Defaults for various platforms. Only two cases are handled here: wchar_t on Windows with a 2-byte Py_UNICODE, and Py_UCS4 on wide builds. In any other configuration PY_UNICODE_TYPE stays undefined unless it was defined before this header, and the Py_UNICODE typedef that uses it cannot compile. */ 87#ifndef PY_UNICODE_TYPE 88 89/* Windows has a usable wchar_t type (unless we're using UCS-4) */ 90# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 91# define HAVE_USABLE_WCHAR_T 92# define PY_UNICODE_TYPE wchar_t 93# endif 94 95# if defined(Py_UNICODE_WIDE) 96# define PY_UNICODE_TYPE Py_UCS4 97# endif 98 99#endif 100 101/* If the compiler provides a wchar_t type we try to support it 102 through the interface functions PyUnicode_FromWideChar() and 103 PyUnicode_AsWideChar(). 
*/ 104 105#ifdef HAVE_USABLE_WCHAR_T 106# ifndef HAVE_WCHAR_H 107# define HAVE_WCHAR_H 108# endif 109#endif 110 111#ifdef HAVE_WCHAR_H 112/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 113# ifdef _HAVE_BSDI 114# include <time.h> 115# endif 116# include <wchar.h> 117#endif 118 119/* 120 * Use this typedef when you need to represent a UTF-16 surrogate pair 121 * as single unsigned integer. 122 */ 123#if SIZEOF_INT >= 4 124typedef unsigned int Py_UCS4; 125#elif SIZEOF_LONG >= 4 126typedef unsigned long Py_UCS4; 127#endif 128 129typedef PY_UNICODE_TYPE Py_UNICODE; 130 131/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ 132 133/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds 134 produce different external names and thus cause import errors in 135 case Python interpreters and extensions with mixed compiled in 136 Unicode width assumptions are combined. */ 137 138#ifndef Py_UNICODE_WIDE 139 140# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString 141# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString 142# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject 143# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString 144# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String 145# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString 146# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String 147# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String 148# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String 149# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode 150# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString 151# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar 152# define PyUnicode_Compare PyUnicodeUCS2_Compare 153# define PyUnicode_Concat PyUnicodeUCS2_Concat 154# define PyUnicode_Append PyUnicodeUCS2_Append 155# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel 
156# define PyUnicode_Contains PyUnicodeUCS2_Contains 157# define PyUnicode_Count PyUnicodeUCS2_Count 158# define PyUnicode_Decode PyUnicodeUCS2_Decode 159# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII 160# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap 161# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 162# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault 163# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize 164# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape 165# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 166# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful 167# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 168# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful 169# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 170# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful 171# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape 172# define PyUnicode_Encode PyUnicodeUCS2_Encode 173# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII 174# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap 175# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal 176# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 177# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape 178# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 179# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 180# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 181# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape 182# define PyUnicode_Find PyUnicodeUCS2_Find 183# define PyUnicode_Format PyUnicodeUCS2_Format 184# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject 185# define PyUnicode_FromObject PyUnicodeUCS2_FromObject 186# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal 187# define 
PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode 188# define PyUnicode_FromString PyUnicodeUCS2_FromString 189# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize 190# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV 191# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat 192# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar 193# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding 194# define PyUnicode_GetMax PyUnicodeUCS2_GetMax 195# define PyUnicode_GetSize PyUnicodeUCS2_GetSize 196# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier 197# define PyUnicode_Join PyUnicodeUCS2_Join 198# define PyUnicode_Partition PyUnicodeUCS2_Partition 199# define PyUnicode_RPartition PyUnicodeUCS2_RPartition 200# define PyUnicode_RSplit PyUnicodeUCS2_RSplit 201# define PyUnicode_Replace PyUnicodeUCS2_Replace 202# define PyUnicode_Resize PyUnicodeUCS2_Resize 203# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare 204# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding 205# define PyUnicode_Split PyUnicodeUCS2_Split 206# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines 207# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch 208# define PyUnicode_Translate PyUnicodeUCS2_Translate 209# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap 210# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString 211# define _PyUnicode_Fini _PyUnicodeUCS2_Fini 212# define _PyUnicode_Init _PyUnicodeUCS2_Init 213# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha 214# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit 215# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit 216# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak 217# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase 218# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric 219# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase 220# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart 221# 
define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue 222# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase 223# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace 224# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit 225# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit 226# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase 227# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric 228# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase 229# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase 230 231#else 232 233# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString 234# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString 235# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject 236# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString 237# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String 238# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString 239# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String 240# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String 241# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String 242# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode 243# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString 244# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar 245# define PyUnicode_Compare PyUnicodeUCS4_Compare 246# define PyUnicode_Concat PyUnicodeUCS4_Concat 247# define PyUnicode_Append PyUnicodeUCS4_Append 248# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel 249# define PyUnicode_Contains PyUnicodeUCS4_Contains 250# define PyUnicode_Count PyUnicodeUCS4_Count 251# define PyUnicode_Decode PyUnicodeUCS4_Decode 252# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII 253# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap 254# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 255# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault 256# 
define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize 257# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape 258# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 259# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful 260# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 261# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful 262# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 263# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful 264# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape 265# define PyUnicode_Encode PyUnicodeUCS4_Encode 266# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII 267# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap 268# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal 269# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 270# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape 271# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 272# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 273# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 274# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape 275# define PyUnicode_Find PyUnicodeUCS4_Find 276# define PyUnicode_Format PyUnicodeUCS4_Format 277# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject 278# define PyUnicode_FromObject PyUnicodeUCS4_FromObject 279# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal 280# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode 281# define PyUnicode_FromString PyUnicodeUCS4_FromString 282# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize 283# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV 284# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat 285# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar 286# define PyUnicode_GetDefaultEncoding 
PyUnicodeUCS4_GetDefaultEncoding 287# define PyUnicode_GetMax PyUnicodeUCS4_GetMax 288# define PyUnicode_GetSize PyUnicodeUCS4_GetSize 289# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier 290# define PyUnicode_Join PyUnicodeUCS4_Join 291# define PyUnicode_Partition PyUnicodeUCS4_Partition 292# define PyUnicode_RPartition PyUnicodeUCS4_RPartition 293# define PyUnicode_RSplit PyUnicodeUCS4_RSplit 294# define PyUnicode_Replace PyUnicodeUCS4_Replace 295# define PyUnicode_Resize PyUnicodeUCS4_Resize 296# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare 297# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding 298# define PyUnicode_Split PyUnicodeUCS4_Split 299# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines 300# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch 301# define PyUnicode_Translate PyUnicodeUCS4_Translate 302# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap 303# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString 304# define _PyUnicode_Fini _PyUnicodeUCS4_Fini 305# define _PyUnicode_Init _PyUnicodeUCS4_Init 306# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha 307# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit 308# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit 309# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak 310# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase 311# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric 312# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase 313# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart 314# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue 315# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase 316# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace 317# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit 318# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit 319# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase 320# define _PyUnicode_ToNumeric 
_PyUnicodeUCS4_ToNumeric 321# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase 322# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase 323 324 325#endif 326 327/* --- Internal Unicode Operations ---------------------------------------- */ 328 329/* If you want Python to use the compiler's wctype.h functions instead 330 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or 331 configure Python using --with-wctype-functions. This reduces the 332 interpreter's code size. */ 333 334#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) 335 336#include <wctype.h> 337 338#define Py_UNICODE_ISSPACE(ch) iswspace(ch) 339 340#define Py_UNICODE_ISLOWER(ch) iswlower(ch) 341#define Py_UNICODE_ISUPPER(ch) iswupper(ch) 342#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 343#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 344 345#define Py_UNICODE_TOLOWER(ch) towlower(ch) 346#define Py_UNICODE_TOUPPER(ch) towupper(ch) 347#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 348 349#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 350#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 351#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 352 353#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 354#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 355#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 356 357#define Py_UNICODE_ISALPHA(ch) iswalpha(ch) 358 359#else 360 361/* Since splitting on whitespace is an important use case, and whitespace 362 in most situations is solely ASCII whitespace, we optimize for the common 363 case by using a quick look-up table with an inlined check. 364 */ 365extern const unsigned char _Py_ascii_whitespace[]; 366 367#define Py_UNICODE_ISSPACE(ch) \ 368 ((ch) < 128U ? 
_Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 369 /* In this branch every classification and conversion macro forwards to the matching _PyUnicode_* function. */ 370#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 371#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 372#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 373#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 374 375#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 376#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 377#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 378 379#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 380#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 381#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 382 383#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 384#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 385#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 386 387#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 388 389#endif 390 /* True if ch is alphabetic, decimal, digit or numeric. The argument is expanded into each of the four tests and may be evaluated up to four times, so it must not have side effects. */ 391#define Py_UNICODE_ISALNUM(ch) \ 392 (Py_UNICODE_ISALPHA(ch) || \ 393 Py_UNICODE_ISDECIMAL(ch) || \ 394 Py_UNICODE_ISDIGIT(ch) || \ 395 Py_UNICODE_ISNUMERIC(ch)) 396 /* Copy length Py_UNICODE units (not bytes) from source to target with Py_MEMCPY. */ 397#define Py_UNICODE_COPY(target, source, length) \ 398 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 399 /* Store value into the first length Py_UNICODE units of target. target and value are evaluated once; length is re-evaluated on every loop iteration, so pass an expression without side effects. */ 400#define Py_UNICODE_FILL(target, value, length) do\ 401 {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 402 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 403 } while (0) 404 405/* check if substring matches at given offset. 
the offset must be 406 valid, and the substring must not be empty */ 407#define Py_UNICODE_MATCH(string, offset, substring) \ 408 ((*((string)->str + (offset)) == *((substring)->str)) && \ 409 ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ 410 !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) 411 412#ifdef __cplusplus 413extern "C" { 414#endif 415 416/* --- Unicode Type ------------------------------------------------------- */ 417 418typedef struct { 419 PyObject_HEAD 420 Py_ssize_t length; /* Length of raw Unicode data in buffer */ 421 Py_UNICODE *str; /* Raw Unicode buffer */ 422 long hash; /* Hash value; -1 if not set */ 423 int state; /* != 0 if interned. In this case the two 424 * references from the dictionary to this object 425 * are *not* counted in ob_refcnt. */ 426 PyObject *defenc; /* (Default) Encoded version as Python 427 string, or NULL; this is used for 428 implementing the buffer protocol */ 429} PyUnicodeObject; 430 431PyAPI_DATA(PyTypeObject) PyUnicode_Type; 432PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 433 434#define SSTATE_NOT_INTERNED 0 435#define SSTATE_INTERNED_MORTAL 1 436#define SSTATE_INTERNED_IMMORTAL 2 437 438#define PyUnicode_Check(op) \ 439 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 440#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 441 442/* Fast access macros */ 443#define PyUnicode_GET_SIZE(op) \ 444 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length)) 445#define PyUnicode_GET_DATA_SIZE(op) \ 446 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))) 447#define PyUnicode_AS_UNICODE(op) \ 448 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str)) 449#define PyUnicode_AS_DATA(op) \ 450 (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str)) 451 452/* --- Constants 
---------------------------------------------------------- */ 453 454/* This Unicode character will be used as replacement character during 455 decoding if the errors argument is set to "replace". Note: the 456 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 457 Unicode 3.0. */ 458 459#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 460 461/* === Public API ========================================================= */ 462 463/* --- Plain Py_UNICODE --------------------------------------------------- */ 464 465/* Create a Unicode Object from the Py_UNICODE buffer u of the given 466 size. 467 468 u may be NULL which causes the contents to be undefined. It is the 469 user's responsibility to fill in the needed data afterwards. Note 470 that modifying the Unicode object contents after construction is 471 only allowed if u was set to NULL. 472 473 The buffer is copied into the new object. */ 474 475PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 476 const Py_UNICODE *u, /* Unicode buffer */ 477 Py_ssize_t size /* size of buffer */ 478 ); 479 480/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */ 481PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 482 const char *u, /* char buffer */ 483 Py_ssize_t size /* size of buffer */ 484 ); 485 486/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 487 Latin-1 encoded bytes */ 488PyAPI_FUNC(PyObject*) PyUnicode_FromString( 489 const char *u /* string */ 490 ); 491 492/* Return a read-only pointer to the Unicode object's internal 493 Py_UNICODE buffer. */ 494 495PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 496 PyObject *unicode /* Unicode object */ 497 ); 498 499/* Get the length of the Unicode object. */ 500 501PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 502 PyObject *unicode /* Unicode object */ 503 ); 504 505/* Get the maximum ordinal for a Unicode character. 
*/ 506PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 507 508/* Resize an already allocated Unicode object to the new size length. 509 510 *unicode is modified to point to the new (resized) object and 0 is 511 returned on success. 512 513 This API may only be called by the function which also called the 514 Unicode constructor. The refcount on the object must be 1. Otherwise, 515 an error is returned. 516 517 Error handling is implemented as follows: an exception is set, -1 518 is returned and *unicode is left untouched. 519 520*/ 521 522PyAPI_FUNC(int) PyUnicode_Resize( 523 PyObject **unicode, /* Pointer to the Unicode object */ 524 Py_ssize_t length /* New length */ 525 ); 526 527/* Coerce obj to a Unicode object and return a reference with 528 *incremented* refcount. 529 530 Coercion is done in the following way: 531 532 1. String and other char buffer compatible objects are decoded 533 under the assumption that they contain data using the current 534 default encoding. Decoding is done in "strict" mode. 535 536 2. All other objects (including Unicode objects) raise an 537 exception. 538 539 The API returns NULL in case of an error. The caller is responsible 540 for decref'ing the returned objects. 541 542*/ 543 544PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 545 register PyObject *obj, /* Object */ 546 const char *encoding, /* encoding */ 547 const char *errors /* error handling */ 548 ); 549 550/* Coerce obj to a Unicode object and return a reference with 551 *incremented* refcount. 552 553 Unicode objects are passed back as-is (subclasses are converted to 554 true Unicode objects), all other objects are delegated to 555 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 556 using the default encoding as basis for decoding the object. 557 558 The API returns NULL in case of an error. The caller is responsible 559 for decref'ing the returned objects. 
560 561*/ 562 563PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 564 register PyObject *obj /* Object */ 565 ); 566 567PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list); 568PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...); 569 570PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 571PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 572PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *); 573PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 574 575/* Use only if you know it's a string: the macro casts op to PyUnicodeObject* without any type check and yields its state field. */ 576#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state) 577 578/* --- wchar_t support for platforms which support it --------------------- */ 579 580#ifdef HAVE_WCHAR_H 581 582/* Create a Unicode Object from the wchar_t buffer w of the given 583 size. 584 585 The buffer is copied into the new object. */ 586 587PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 588 register const wchar_t *w, /* wchar_t buffer */ 589 Py_ssize_t size /* size of buffer */ 590 ); 591 592/* Copies the Unicode Object contents into the wchar_t buffer w. At 593 most size wchar_t characters are copied. 594 595 Note that the resulting wchar_t string may or may not be 596 0-terminated. It is the responsibility of the caller to make sure 597 that the wchar_t string is 0-terminated in case this is required by 598 the application. 599 600 Returns the number of wchar_t characters copied (excluding a 601 possibly trailing 0-termination character) or -1 in case of an 602 error. */ 603 604PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 605 PyUnicodeObject *unicode, /* Unicode object */ 606 register wchar_t *w, /* wchar_t buffer */ 607 Py_ssize_t size /* size of buffer */ 608 ); 609 610#endif 611 612/* --- Unicode ordinals --------------------------------------------------- */ 613 614/* Create a Unicode Object from the given Unicode code point ordinal. 
615 616 The ordinal must be in range(0x10000) on narrow Python builds 617 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 618 raised in case it is not. 619 620*/ 621 622PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 623 624/* === Builtin Codecs ===================================================== 625 626 Many of these APIs take two arguments encoding and errors. These 627 parameters encoding and errors have the same semantics as the ones 628 of the builtin unicode() API. 629 630 Setting encoding to NULL causes the default encoding to be used. 631 632 Error handling is set by errors which may also be set to NULL 633 meaning to use the default handling defined for the codec. Default 634 error handling for all builtin codecs is "strict" (ValueErrors are 635 raised). 636 637 The codecs all use a similar interface. Only deviations from the 638 generic ones are documented. 639 640*/ 641 642/* --- Manage the default encoding ---------------------------------------- */ 643 644/* Return a Python string holding the default encoded value of the 645 Unicode object. 646 647 The resulting string is cached in the Unicode object for subsequent 648 usage by this function. The cached version is needed to implement 649 the character buffer interface and will live (at least) as long as 650 the Unicode object itself. 651 652 The refcount of the string is *not* incremented. 653 654 *** Exported for internal use by the interpreter only !!! *** 655 656*/ 657 658PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString( 659 PyObject *, const char *); 660 661/* Decode a null-terminated string using Py_FileSystemDefaultEncoding. 662 663 If the encoding is supported by one of the built-in codecs (i.e., UTF-8, 664 UTF-16, UTF-32, Latin-1 or MBCS), that codec is used; otherwise the 665 function falls back to UTF-8 and replaces invalid characters with '?'. 666 667 The function is intended to be used for paths and file names only 668 during bootstrapping process where the codecs are not set up. 
669*/ 670 671PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 672 const char *s /* encoded string */ 673 ); 674 675PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 676 const char *s, /* encoded string */ 677 Py_ssize_t size /* size */ 678 ); 679 680 681/* Returns a char* holding the UTF-8 encoded value of the 682 Unicode object, and its size. 683 684 If the output argument is NULL, no size is stored. 685*/ 686 687PyAPI_FUNC(char *) PyUnicode_AsStringAndSize(PyObject*, Py_ssize_t *); 688 689/* Returns a char* holding the UTF-8 encoded value of the Unicode object. 690 691 This is equivalent to PyUnicode_AsStringAndSize(x, NULL). DEPRECATED in favour of PyUnicode_AsStringAndSize() -- NOTE(review): the original remark did not name the function it applied to; presumably it is this one, confirm. 692 */ 693 694PyAPI_FUNC(char *) PyUnicode_AsString(PyObject*); 695 696/* Returns the currently active default encoding. 697 698 NOTE(review): presumably the encoding's name as a C string -- confirm. 699 700 */ 701 702PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 703 704/* Sets the currently active default encoding. 705 706 Returns 0 on success, -1 in case of an error. 707 708 */ 709 710PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding( 711 const char *encoding /* Encoding name in standard form */ 712 ); 713 714/* --- Generic Codecs ----------------------------------------------------- */ 715 716/* Create a Unicode object by decoding the encoded string s of the 717 given size. */ 718 719PyAPI_FUNC(PyObject*) PyUnicode_Decode( 720 const char *s, /* encoded string */ 721 Py_ssize_t size, /* size of buffer */ 722 const char *encoding, /* encoding */ 723 const char *errors /* error handling */ 724 ); 725 726/* Encodes a Py_UNICODE buffer of the given size and returns a 727 Python string object. */ 728 729PyAPI_FUNC(PyObject*) PyUnicode_Encode( 730 const Py_UNICODE *s, /* Unicode char buffer */ 731 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 732 const char *encoding, /* encoding */ 733 const char *errors /* error handling */ 734 ); 735 736/* Encodes a Unicode object and returns the result as Python 737 object. 
*/ 738 739PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 740 PyObject *unicode, /* Unicode object */ 741 const char *encoding, /* encoding */ 742 const char *errors /* error handling */ 743 ); 744 745/* Encodes a Unicode object and returns the result as Python string 746 object. */ 747 748PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 749 PyObject *unicode, /* Unicode object */ 750 const char *encoding, /* encoding */ 751 const char *errors /* error handling */ 752 ); 753 754PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 755 PyObject* string /* 256 character map */ 756 ); 757 758 759/* --- UTF-7 Codecs ------------------------------------------------------- */ 760 761PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 762 const char *string, /* UTF-7 encoded string */ 763 Py_ssize_t length, /* size of string */ 764 const char *errors /* error handling */ 765 ); 766 767PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 768 const char *string, /* UTF-7 encoded string */ 769 Py_ssize_t length, /* size of string */ 770 const char *errors, /* error handling */ 771 Py_ssize_t *consumed /* bytes consumed */ 772 ); 773 774PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 775 const Py_UNICODE *data, /* Unicode char buffer */ 776 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 777 int encodeSetO, /* force the encoder to encode characters in 778 Set O, as described in RFC2152 */ 779 int encodeWhiteSpace, /* force the encoder to encode space, tab, 780 carriage return and linefeed characters */ 781 const char *errors /* error handling */ 782 ); 783 784/* --- UTF-8 Codecs ------------------------------------------------------- */ 785 786PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 787 const char *string, /* UTF-8 encoded string */ 788 Py_ssize_t length, /* size of string */ 789 const char *errors /* error handling */ 790 ); 791 792PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 793 const char *string, /* UTF-8 encoded string */ 794 Py_ssize_t length, /* size of string 
*/ 795 const char *errors, /* error handling */ 796 Py_ssize_t *consumed /* bytes consumed */ 797 ); 798 799PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 800 PyObject *unicode /* Unicode object */ 801 ); 802 803PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 804 const Py_UNICODE *data, /* Unicode char buffer */ 805 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 806 const char *errors /* error handling */ 807 ); 808 809/* --- UTF-32 Codecs ------------------------------------------------------ */ 810 811/* Decodes length bytes from a UTF-32 encoded buffer string and returns 812 the corresponding Unicode object. 813 814 errors (if non-NULL) defines the error handling. It defaults 815 to "strict". 816 817 If byteorder is non-NULL, the decoder starts decoding using the 818 given byte order: 819 820 *byteorder == -1: little endian 821 *byteorder == 0: native order 822 *byteorder == 1: big endian 823 824 In native mode, the first four bytes of the stream are checked for a 825 BOM mark. If found, the BOM mark is analysed, the byte order 826 adjusted and the BOM skipped. In the other modes, no BOM mark 827 interpretation is done. After completion, *byteorder is set to the 828 current byte order at the end of input data. 829 830 If byteorder is NULL, the codec starts in native order mode. 
831 832*/ 833 834PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 835 const char *string, /* UTF-32 encoded string */ 836 Py_ssize_t length, /* size of string */ 837 const char *errors, /* error handling */ 838 int *byteorder /* pointer to byteorder to use 839 0=native;-1=LE,1=BE; updated on 840 exit */ 841 ); 842 843PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 844 const char *string, /* UTF-32 encoded string */ 845 Py_ssize_t length, /* size of string */ 846 const char *errors, /* error handling */ 847 int *byteorder, /* pointer to byteorder to use 848 0=native;-1=LE,1=BE; updated on 849 exit */ 850 Py_ssize_t *consumed /* bytes consumed */ 851 ); 852 853/* Returns a Python string using the UTF-32 encoding in native byte 854 order. The string always starts with a BOM mark. */ 855 856PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 857 PyObject *unicode /* Unicode object */ 858 ); 859 860/* Returns a Python string object holding the UTF-32 encoded value of 861 the Unicode data. 862 863 If byteorder is not 0, output is written according to the following 864 byte order: 865 866 byteorder == -1: little endian 867 byteorder == 0: native byte order (writes a BOM mark) 868 byteorder == 1: big endian 869 870 If byteorder is 0, the output string will always start with the 871 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 872 prepended. 873 874*/ 875 876PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 877 const Py_UNICODE *data, /* Unicode char buffer */ 878 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 879 const char *errors, /* error handling */ 880 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 881 ); 882 883/* --- UTF-16 Codecs ------------------------------------------------------ */ 884 885/* Decodes length bytes from a UTF-16 encoded buffer string and returns 886 the corresponding Unicode object. 887 888 errors (if non-NULL) defines the error handling. It defaults 889 to "strict". 
890 891 If byteorder is non-NULL, the decoder starts decoding using the 892 given byte order: 893 894 *byteorder == -1: little endian 895 *byteorder == 0: native order 896 *byteorder == 1: big endian 897 898 In native mode, the first two bytes of the stream are checked for a 899 BOM mark. If found, the BOM mark is analysed, the byte order 900 adjusted and the BOM skipped. In the other modes, no BOM mark 901 interpretation is done. After completion, *byteorder is set to the 902 current byte order at the end of input data. 903 904 If byteorder is NULL, the codec starts in native order mode. 905 906*/ 907 908PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 909 const char *string, /* UTF-16 encoded string */ 910 Py_ssize_t length, /* size of string */ 911 const char *errors, /* error handling */ 912 int *byteorder /* pointer to byteorder to use 913 0=native;-1=LE,1=BE; updated on 914 exit */ 915 ); 916 917PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 918 const char *string, /* UTF-16 encoded string */ 919 Py_ssize_t length, /* size of string */ 920 const char *errors, /* error handling */ 921 int *byteorder, /* pointer to byteorder to use 922 0=native;-1=LE,1=BE; updated on 923 exit */ 924 Py_ssize_t *consumed /* bytes consumed */ 925 ); 926 927/* Returns a Python string using the UTF-16 encoding in native byte 928 order. The string always starts with a BOM mark. */ 929 930PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 931 PyObject *unicode /* Unicode object */ 932 ); 933 934/* Returns a Python string object holding the UTF-16 encoded value of 935 the Unicode data. 936 937 If byteorder is not 0, output is written according to the following 938 byte order: 939 940 byteorder == -1: little endian 941 byteorder == 0: native byte order (writes a BOM mark) 942 byteorder == 1: big endian 943 944 If byteorder is 0, the output string will always start with the 945 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 946 prepended. 
947 948 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 949 UCS-2. This trick makes it possible to add full UTF-16 capabilities 950 at a later point without compromising the APIs. 951 952*/ 953 954PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 955 const Py_UNICODE *data, /* Unicode char buffer */ 956 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 957 const char *errors, /* error handling */ 958 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 959 ); 960 961/* --- Unicode-Escape Codecs ---------------------------------------------- */ 962 963PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 964 const char *string, /* Unicode-Escape encoded string */ 965 Py_ssize_t length, /* size of string */ 966 const char *errors /* error handling */ 967 ); 968 969PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 970 PyObject *unicode /* Unicode object */ 971 ); 972 973PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 974 const Py_UNICODE *data, /* Unicode char buffer */ 975 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 976 ); 977 978/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 979 980PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 981 const char *string, /* Raw-Unicode-Escape encoded string */ 982 Py_ssize_t length, /* size of string */ 983 const char *errors /* error handling */ 984 ); 985 986PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 987 PyObject *unicode /* Unicode object */ 988 ); 989 990PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 991 const Py_UNICODE *data, /* Unicode char buffer */ 992 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 993 ); 994 995/* --- Unicode Internal Codec --------------------------------------------- 996 997 Only for internal use in _codecsmodule.c */ 998 999PyObject *_PyUnicode_DecodeUnicodeInternal( 1000 const char *string, 1001 Py_ssize_t length, 1002 const char *errors 1003 ); 1004 1005/* --- 
Latin-1 Codecs ----------------------------------------------------- 1006 1007 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1008 1009*/ 1010 1011PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1012 const char *string, /* Latin-1 encoded string */ 1013 Py_ssize_t length, /* size of string */ 1014 const char *errors /* error handling */ 1015 ); 1016 1017PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1018 PyObject *unicode /* Unicode object */ 1019 ); 1020 1021PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1022 const Py_UNICODE *data, /* Unicode char buffer */ 1023 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1024 const char *errors /* error handling */ 1025 ); 1026 1027/* --- ASCII Codecs ------------------------------------------------------- 1028 1029 Only 7-bit ASCII data is accepted. All other codes generate errors. 1030 1031*/ 1032 1033PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1034 const char *string, /* ASCII encoded string */ 1035 Py_ssize_t length, /* size of string */ 1036 const char *errors /* error handling */ 1037 ); 1038 1039PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1040 PyObject *unicode /* Unicode object */ 1041 ); 1042 1043PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1044 const Py_UNICODE *data, /* Unicode char buffer */ 1045 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1046 const char *errors /* error handling */ 1047 ); 1048 1049/* --- Character Map Codecs ----------------------------------------------- 1050 1051 This codec uses mappings to encode and decode characters. 1052 1053 Decoding mappings must map single string characters to single 1054 Unicode characters, integers (which are then interpreted as Unicode 1055 ordinals) or None (meaning "undefined mapping" and causing an 1056 error). 
1057 1058 Encoding mappings must map single Unicode characters to single 1059 string characters, integers (which are then interpreted as Latin-1 1060 ordinals) or None (meaning "undefined mapping" and causing an 1061 error). 1062 1063 If a character lookup fails with a LookupError, the character is 1064 copied as-is meaning that its ordinal value will be interpreted as 1065 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1066 to contain those mappings which map characters to different code 1067 points. 1068 1069*/ 1070 1071PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1072 const char *string, /* Encoded string */ 1073 Py_ssize_t length, /* size of string */ 1074 PyObject *mapping, /* character mapping 1075 (char ordinal -> unicode ordinal) */ 1076 const char *errors /* error handling */ 1077 ); 1078 1079PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1080 PyObject *unicode, /* Unicode object */ 1081 PyObject *mapping /* character mapping 1082 (unicode ordinal -> char ordinal) */ 1083 ); 1084 1085PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1086 const Py_UNICODE *data, /* Unicode char buffer */ 1087 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1088 PyObject *mapping, /* character mapping 1089 (unicode ordinal -> char ordinal) */ 1090 const char *errors /* error handling */ 1091 ); 1092 1093/* Translate a Py_UNICODE buffer of the given length by applying a 1094 character mapping table to it and return the resulting Unicode 1095 object. 1096 1097 The mapping table must map Unicode ordinal integers to Unicode 1098 ordinal integers or None (causing deletion of the character). 1099 1100 Mapping tables may be dictionaries or sequences. Unmapped character 1101 ordinals (ones which cause a LookupError) are left untouched and 1102 are copied as-is. 
1103 1104*/ 1105 1106PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1107 const Py_UNICODE *data, /* Unicode char buffer */ 1108 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */ 1109 PyObject *table, /* Translate table */ 1110 const char *errors /* error handling */ 1111 ); 1112 1113#ifdef MS_WIN32 1114 1115/* --- MBCS codecs for Windows -------------------------------------------- */ 1116 1117PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1118 const char *string, /* MBCS encoded string */ 1119 Py_ssize_t length, /* size of string */ 1120 const char *errors /* error handling */ 1121 ); 1122 1123PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1124 const char *string, /* MBCS encoded string */ 1125 Py_ssize_t length, /* size of string */ 1126 const char *errors, /* error handling */ 1127 Py_ssize_t *consumed /* bytes consumed */ 1128 ); 1129 1130PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1131 PyObject *unicode /* Unicode object */ 1132 ); 1133 1134PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1135 const Py_UNICODE *data, /* Unicode char buffer */ 1136 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1137 const char *errors /* error handling */ 1138 ); 1139 1140#endif /* MS_WIN32 */ 1141 1142/* --- Decimal Encoder ---------------------------------------------------- */ 1143 1144/* Takes a Unicode string holding a decimal value and writes it into 1145 an output buffer using standard ASCII digit codes. 1146 1147 The output buffer has to provide at least length+1 bytes of storage 1148 area. The output string is 0-terminated. 1149 1150 The encoder converts whitespace to ' ', decimal characters to their 1151 corresponding ASCII digit and all other Latin-1 characters except 1152 \0 as-is. Characters outside this range (Unicode ordinals 1-255) 1153 are treated as errors. This includes embedded NULL bytes. 
1154 1155 Error handling is defined by the errors argument: 1156 1157 NULL or "strict": raise a ValueError 1158 "ignore": ignore the wrong characters (these are not copied to the 1159 output buffer) 1160 "replace": replaces illegal characters with '?' 1161 1162 Returns 0 on success, -1 on failure. 1163 1164*/ 1165 1166PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1167 Py_UNICODE *s, /* Unicode buffer */ 1168 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1169 char *output, /* Output buffer; must have size >= length+1 */ 1170 const char *errors /* error handling */ 1171 ); 1172 1173/* --- Methods & Slots ---------------------------------------------------- 1174 1175 These are capable of handling Unicode objects and strings on input 1176 (we refer to them as strings in the descriptions) and return 1177 Unicode objects or integers as appropriate. */ 1178 1179/* Concat two strings giving a new Unicode string. */ 1180 1181PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1182 PyObject *left, /* Left string */ 1183 PyObject *right /* Right string */ 1184 ); 1185 1186/* Concat two strings and put the result in *pleft 1187 (sets *pleft to NULL on error) */ 1188 1189PyAPI_FUNC(void) PyUnicode_Append( 1190 PyObject **pleft, /* Pointer to left string */ 1191 PyObject *right /* Right string */ 1192 ); 1193 1194/* Concat two strings, put the result in *pleft and drop the right object 1195 (sets *pleft to NULL on error) */ 1196 1197PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1198 PyObject **pleft, /* Pointer to left string */ 1199 PyObject *right /* Right string */ 1200 ); 1201 1202/* Split a string giving a list of Unicode strings. 1203 1204 If sep is NULL, splitting will be done at all whitespace 1205 substrings. Otherwise, splits occur at the given separator. 1206 1207 At most maxsplit splits will be done. If negative, no limit is set. 1208 1209 Separators are not included in the resulting list. 
1210 1211*/ 1212 1213PyAPI_FUNC(PyObject*) PyUnicode_Split( 1214 PyObject *s, /* String to split */ 1215 PyObject *sep, /* String separator */ 1216 Py_ssize_t maxsplit /* Maxsplit count */ 1217 ); 1218 1219/* Ditto, but split at line breaks. 1220 1221 CRLF is considered to be one line break. Line breaks are not 1222 included in the resulting list unless keepends is true. */ 1223 1224PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1225 PyObject *s, /* String to split */ 1226 int keepends /* If true, line end markers are included */ 1227 ); 1228 1229/* Partition a string using a given separator. */ 1230 1231PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1232 PyObject *s, /* String to partition */ 1233 PyObject *sep /* String separator */ 1234 ); 1235 1236/* Partition a string using a given separator, searching from the end of the 1237 string. */ 1238 1239PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1240 PyObject *s, /* String to partition */ 1241 PyObject *sep /* String separator */ 1242 ); 1243 1244/* Split a string giving a list of Unicode strings. 1245 1246 If sep is NULL, splitting will be done at all whitespace 1247 substrings. Otherwise, splits occur at the given separator. 1248 1249 At most maxsplit splits will be done. But unlike PyUnicode_Split 1250 PyUnicode_RSplit splits from the end of the string. If negative, 1251 no limit is set. 1252 1253 Separators are not included in the resulting list. 1254 1255*/ 1256 1257PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1258 PyObject *s, /* String to split */ 1259 PyObject *sep, /* String separator */ 1260 Py_ssize_t maxsplit /* Maxsplit count */ 1261 ); 1262 1263/* Translate a string by applying a character mapping table to it and 1264 return the resulting Unicode object. 1265 1266 The mapping table must map Unicode ordinal integers to Unicode 1267 ordinal integers or None (causing deletion of the character). 1268 1269 Mapping tables may be dictionaries or sequences. 
Unmapped character 1270 ordinals (ones which cause a LookupError) are left untouched and 1271 are copied as-is. 1272 1273*/ 1274 1275PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1276 PyObject *str, /* String */ 1277 PyObject *table, /* Translate table */ 1278 const char *errors /* error handling */ 1279 ); 1280 1281/* Join a sequence of strings using the given separator and return 1282 the resulting Unicode string. */ 1283 1284PyAPI_FUNC(PyObject*) PyUnicode_Join( 1285 PyObject *separator, /* Separator string */ 1286 PyObject *seq /* Sequence object */ 1287 ); 1288 1289/* Return 1 if substr matches str[start:end] at the given tail end, 0 1290 otherwise. */ 1291 1292PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1293 PyObject *str, /* String */ 1294 PyObject *substr, /* Prefix or Suffix string */ 1295 Py_ssize_t start, /* Start index */ 1296 Py_ssize_t end, /* Stop index */ 1297 int direction /* Tail end: -1 prefix, +1 suffix */ 1298 ); 1299 1300/* Return the first position of substr in str[start:end] using the 1301 given search direction or -1 if not found. -2 is returned in case 1302 an error occurred and an exception is set. */ 1303 1304PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1305 PyObject *str, /* String */ 1306 PyObject *substr, /* Substring to find */ 1307 Py_ssize_t start, /* Start index */ 1308 Py_ssize_t end, /* Stop index */ 1309 int direction /* Find direction: +1 forward, -1 backward */ 1310 ); 1311 1312/* Count the number of occurrences of substr in str[start:end]. */ 1313 1314PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1315 PyObject *str, /* String */ 1316 PyObject *substr, /* Substring to count */ 1317 Py_ssize_t start, /* Start index */ 1318 Py_ssize_t end /* Stop index */ 1319 ); 1320 1321/* Replace at most maxcount occurrences of substr in str with replstr 1322 and return the resulting Unicode object. 
*/ 1323 1324PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1325 PyObject *str, /* String */ 1326 PyObject *substr, /* Substring to find */ 1327 PyObject *replstr, /* Substring to replace */ 1328 Py_ssize_t maxcount /* Max. number of replacements to apply; 1329 -1 = all */ 1330 ); 1331 1332/* Compare two strings and return -1, 0, 1 for less than, equal, 1333 greater than resp. */ 1334 1335PyAPI_FUNC(int) PyUnicode_Compare( 1336 PyObject *left, /* Left string */ 1337 PyObject *right /* Right string */ 1338 ); 1339 1340PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1341 PyObject *left, 1342 const char *right 1343 ); 1344 1345/* Rich compare two strings and return one of the following: 1346 1347 - NULL in case an exception was raised 1348 - Py_True or Py_False for successful comparisons 1349 - Py_NotImplemented in case the type combination is unknown 1350 1351 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1352 case the conversion of the arguments to Unicode fails with a 1353 UnicodeDecodeError. 1354 1355 Possible values for op: 1356 1357 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1358 1359*/ 1360 1361PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1362 PyObject *left, /* Left string */ 1363 PyObject *right, /* Right string */ 1364 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1365 ); 1366 1367/* Apply an argument tuple or dictionary to a format string and return 1368 the resulting Unicode string. */ 1369 1370PyAPI_FUNC(PyObject *) PyUnicode_Format( 1371 PyObject *format, /* Format string */ 1372 PyObject *args /* Argument tuple or dictionary */ 1373 ); 1374 1375/* Checks whether element is contained in container and returns 1/0 1376 accordingly. 1377 1378 element has to coerce to a one-element Unicode string. -1 is 1379 returned in case of an error. 
*/ 1380 1381PyAPI_FUNC(int) PyUnicode_Contains( 1382 PyObject *container, /* Container string */ 1383 PyObject *element /* Element string */ 1384 ); 1385 1386/* Checks whether argument is a valid identifier. */ 1387 1388PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1389 1390/* Externally visible for str.strip(unicode) */ 1391PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 1392 PyUnicodeObject *self, 1393 int striptype, 1394 PyObject *sepobj 1395 ); 1396 1397/* === Characters Type APIs =============================================== */ 1398 1399/* These should not be used directly. Use the Py_UNICODE_IS* and 1400 Py_UNICODE_TO* macros instead. 1401 1402 These APIs are implemented in Objects/unicodectype.c. 1403 1404*/ 1405 1406PyAPI_FUNC(int) _PyUnicode_IsLowercase( 1407 Py_UNICODE ch /* Unicode character */ 1408 ); 1409 1410PyAPI_FUNC(int) _PyUnicode_IsUppercase( 1411 Py_UNICODE ch /* Unicode character */ 1412 ); 1413 1414PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 1415 Py_UNICODE ch /* Unicode character */ 1416 ); 1417 1418PyAPI_FUNC(int) _PyUnicode_IsXidStart( 1419 Py_UNICODE ch /* Unicode character */ 1420 ); 1421 1422PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 1423 Py_UNICODE ch /* Unicode character */ 1424 ); 1425 1426PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 1427 const Py_UNICODE ch /* Unicode character */ 1428 ); 1429 1430PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 1431 const Py_UNICODE ch /* Unicode character */ 1432 ); 1433 1434PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase( 1435 Py_UNICODE ch /* Unicode character */ 1436 ); 1437 1438PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase( 1439 Py_UNICODE ch /* Unicode character */ 1440 ); 1441 1442PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase( 1443 Py_UNICODE ch /* Unicode character */ 1444 ); 1445 1446PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 1447 Py_UNICODE ch /* Unicode character */ 1448 ); 1449 1450PyAPI_FUNC(int) _PyUnicode_ToDigit( 1451 Py_UNICODE ch /* Unicode character */ 1452 ); 1453 1454PyAPI_FUNC(double) 
_PyUnicode_ToNumeric( 1455 Py_UNICODE ch /* Unicode character */ 1456 ); 1457 1458PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 1459 Py_UNICODE ch /* Unicode character */ 1460 ); 1461 1462PyAPI_FUNC(int) _PyUnicode_IsDigit( 1463 Py_UNICODE ch /* Unicode character */ 1464 ); 1465 1466PyAPI_FUNC(int) _PyUnicode_IsNumeric( 1467 Py_UNICODE ch /* Unicode character */ 1468 ); 1469 1470PyAPI_FUNC(int) _PyUnicode_IsAlpha( 1471 Py_UNICODE ch /* Unicode character */ 1472 ); 1473 1474PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u); 1475 1476PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 1477 Py_UNICODE *s1, const Py_UNICODE *s2); 1478 1479PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 1480 Py_UNICODE *s1, const Py_UNICODE *s2, size_t n); 1481 1482PyAPI_FUNC(int) Py_UNICODE_strcmp( 1483 const Py_UNICODE *s1, const Py_UNICODE *s2); 1484 1485PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 1486 const Py_UNICODE *s, Py_UNICODE c 1487 ); 1488 1489#ifdef __cplusplus 1490} 1491#endif 1492#endif /* !Py_UNICODEOBJECT_H */ 1493