unicodeobject.h revision a156e09b19cd239176a9316ddca7641784eea99e
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4#include <stdarg.h> 5 6/* 7 8Unicode implementation based on original code by Fredrik Lundh, 9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10Unicode Integration Proposal (see file Misc/unicode.txt). 11 12Copyright (c) Corporation for National Research Initiatives. 13 14 15 Original header: 16 -------------------------------------------------------------------- 17 18 * Yet another Unicode string type for Python. This type supports the 19 * 16-bit Basic Multilingual Plane (BMP) only. 20 * 21 * Written by Fredrik Lundh, January 1999. 22 * 23 * Copyright (c) 1999 by Secret Labs AB. 24 * Copyright (c) 1999 by Fredrik Lundh. 25 * 26 * fredrik@pythonware.com 27 * http://www.pythonware.com 28 * 29 * -------------------------------------------------------------------- 30 * This Unicode String Type is 31 * 32 * Copyright (c) 1999 by Secret Labs AB 33 * Copyright (c) 1999 by Fredrik Lundh 34 * 35 * By obtaining, using, and/or copying this software and/or its 36 * associated documentation, you agree that you have read, understood, 37 * and will comply with the following terms and conditions: 38 * 39 * Permission to use, copy, modify, and distribute this software and its 40 * associated documentation for any purpose and without fee is hereby 41 * granted, provided that the above copyright notice appears in all 42 * copies, and that both that copyright notice and this permission notice 43 * appear in supporting documentation, and that the name of Secret Labs 44 * AB or the author not be used in advertising or publicity pertaining to 45 * distribution of the software without specific, written prior 46 * permission. 47 * 48 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 50 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 55 * -------------------------------------------------------------------- */ 56 57#include <ctype.h> 58 59/* === Internal API ======================================================= */ 60 61/* --- Internal Unicode Format -------------------------------------------- */ 62 63/* Python 3.x requires unicode */ 64#define Py_USING_UNICODE 65 66/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is 67 properly set, but the default rules below doesn't set it. I'll 68 sort this out some other day -- fredrik@pythonware.com */ 69 70#ifndef Py_UNICODE_SIZE 71#error Must define Py_UNICODE_SIZE 72#endif 73 74/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode 75 strings are stored as UCS-2 (with limited support for UTF-16) */ 76 77#if Py_UNICODE_SIZE >= 4 78#define Py_UNICODE_WIDE 79#endif 80 81/* Set these flags if the platform has "wchar.h", "wctype.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83/* #define HAVE_WCHAR_H */ 84/* #define HAVE_USABLE_WCHAR_T */ 85 86/* Defaults for various platforms */ 87#ifndef PY_UNICODE_TYPE 88 89/* Windows has a usable wchar_t type (unless we're using UCS-4) */ 90# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 91# define HAVE_USABLE_WCHAR_T 92# define PY_UNICODE_TYPE wchar_t 93# endif 94 95# if defined(Py_UNICODE_WIDE) 96# define PY_UNICODE_TYPE Py_UCS4 97# endif 98 99#endif 100 101/* If the compiler provides a wchar_t type we try to support it 102 through the interface functions PyUnicode_FromWideChar() and 103 PyUnicode_AsWideChar(). 
*/ 104 105#ifdef HAVE_USABLE_WCHAR_T 106# ifndef HAVE_WCHAR_H 107# define HAVE_WCHAR_H 108# endif 109#endif 110 111#ifdef HAVE_WCHAR_H 112/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 113# ifdef _HAVE_BSDI 114# include <time.h> 115# endif 116# include <wchar.h> 117#endif 118 119/* 120 * Use this typedef when you need to represent a UTF-16 surrogate pair 121 * as single unsigned integer. 122 */ 123#if SIZEOF_INT >= 4 124typedef unsigned int Py_UCS4; 125#elif SIZEOF_LONG >= 4 126typedef unsigned long Py_UCS4; 127#endif 128 129typedef PY_UNICODE_TYPE Py_UNICODE; 130 131/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ 132 133/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds 134 produce different external names and thus cause import errors in 135 case Python interpreters and extensions with mixed compiled in 136 Unicode width assumptions are combined. */ 137 138#ifndef Py_UNICODE_WIDE 139 140# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString 141# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString 142# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject 143# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString 144# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String 145# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString 146# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String 147# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String 148# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String 149# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode 150# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString 151# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar 152# define PyUnicode_Compare PyUnicodeUCS2_Compare 153# define PyUnicode_Concat PyUnicodeUCS2_Concat 154# define PyUnicode_Append PyUnicodeUCS2_Append 155# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel 
156# define PyUnicode_Contains PyUnicodeUCS2_Contains 157# define PyUnicode_Count PyUnicodeUCS2_Count 158# define PyUnicode_Decode PyUnicodeUCS2_Decode 159# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII 160# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap 161# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 162# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault 163# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize 164# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape 165# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 166# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful 167# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 168# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful 169# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 170# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful 171# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape 172# define PyUnicode_Encode PyUnicodeUCS2_Encode 173# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII 174# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap 175# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal 176# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 177# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape 178# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 179# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 180# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 181# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape 182# define PyUnicode_Find PyUnicodeUCS2_Find 183# define PyUnicode_Format PyUnicodeUCS2_Format 184# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject 185# define PyUnicode_FromObject PyUnicodeUCS2_FromObject 186# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal 187# define 
PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode 188# define PyUnicode_FromString PyUnicodeUCS2_FromString 189# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize 190# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV 191# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat 192# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar 193# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding 194# define PyUnicode_GetMax PyUnicodeUCS2_GetMax 195# define PyUnicode_GetSize PyUnicodeUCS2_GetSize 196# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier 197# define PyUnicode_Join PyUnicodeUCS2_Join 198# define PyUnicode_Partition PyUnicodeUCS2_Partition 199# define PyUnicode_RPartition PyUnicodeUCS2_RPartition 200# define PyUnicode_RSplit PyUnicodeUCS2_RSplit 201# define PyUnicode_Replace PyUnicodeUCS2_Replace 202# define PyUnicode_Resize PyUnicodeUCS2_Resize 203# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare 204# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding 205# define PyUnicode_Split PyUnicodeUCS2_Split 206# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines 207# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch 208# define PyUnicode_Translate PyUnicodeUCS2_Translate 209# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap 210# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString 211# define _PyUnicode_Fini _PyUnicodeUCS2_Fini 212# define _PyUnicode_Init _PyUnicodeUCS2_Init 213# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist 214# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha 215# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit 216# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit 217# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak 218# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase 219# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric 220# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase 221# 
define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart 222# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue 223# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase 224# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace 225# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit 226# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit 227# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase 228# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric 229# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase 230# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase 231 232#else 233 234# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString 235# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString 236# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject 237# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString 238# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String 239# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString 240# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String 241# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String 242# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String 243# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode 244# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString 245# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar 246# define PyUnicode_Compare PyUnicodeUCS4_Compare 247# define PyUnicode_Concat PyUnicodeUCS4_Concat 248# define PyUnicode_Append PyUnicodeUCS4_Append 249# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel 250# define PyUnicode_Contains PyUnicodeUCS4_Contains 251# define PyUnicode_Count PyUnicodeUCS4_Count 252# define PyUnicode_Decode PyUnicodeUCS4_Decode 253# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII 254# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap 255# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 256# define 
PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault 257# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize 258# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape 259# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 260# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful 261# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 262# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful 263# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 264# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful 265# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape 266# define PyUnicode_Encode PyUnicodeUCS4_Encode 267# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII 268# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap 269# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal 270# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 271# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape 272# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 273# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 274# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 275# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape 276# define PyUnicode_Find PyUnicodeUCS4_Find 277# define PyUnicode_Format PyUnicodeUCS4_Format 278# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject 279# define PyUnicode_FromObject PyUnicodeUCS4_FromObject 280# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal 281# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode 282# define PyUnicode_FromString PyUnicodeUCS4_FromString 283# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize 284# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV 285# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat 286# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar 287# 
define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding 288# define PyUnicode_GetMax PyUnicodeUCS4_GetMax 289# define PyUnicode_GetSize PyUnicodeUCS4_GetSize 290# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier 291# define PyUnicode_Join PyUnicodeUCS4_Join 292# define PyUnicode_Partition PyUnicodeUCS4_Partition 293# define PyUnicode_RPartition PyUnicodeUCS4_RPartition 294# define PyUnicode_RSplit PyUnicodeUCS4_RSplit 295# define PyUnicode_Replace PyUnicodeUCS4_Replace 296# define PyUnicode_Resize PyUnicodeUCS4_Resize 297# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare 298# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding 299# define PyUnicode_Split PyUnicodeUCS4_Split 300# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines 301# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch 302# define PyUnicode_Translate PyUnicodeUCS4_Translate 303# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap 304# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString 305# define _PyUnicode_Fini _PyUnicodeUCS4_Fini 306# define _PyUnicode_Init _PyUnicodeUCS4_Init /* Wide (UCS-4) branch: this entry must mangle to the UCS4 name like every other entry here; mapping it to the UCS2 name would give narrow and wide builds the same external symbol and defeat the import-time mismatch check. */ 307# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist 308# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha 309# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit 310# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit 311# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak 312# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase 313# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric 314# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase 315# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart 316# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue 317# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase 318# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace 319# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit 320# define _PyUnicode_ToDigit
_PyUnicodeUCS4_ToDigit 321# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase 322# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric 323# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase 324# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase 325 326 327#endif 328 329/* --- Internal Unicode Operations ---------------------------------------- */ 330 331/* If you want Python to use the compiler's wctype.h functions instead 332 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or 333 configure Python using --with-wctype-functions. This reduces the 334 interpreter's code size. */ 335 336#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) 337 338#include <wctype.h> 339 340#define Py_UNICODE_ISSPACE(ch) iswspace(ch) 341 342#define Py_UNICODE_ISLOWER(ch) iswlower(ch) 343#define Py_UNICODE_ISUPPER(ch) iswupper(ch) 344#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 345#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 346 347#define Py_UNICODE_TOLOWER(ch) towlower(ch) 348#define Py_UNICODE_TOUPPER(ch) towupper(ch) 349#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 350 351#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 352#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 353#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 354 355#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 356#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 357#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 358 359#define Py_UNICODE_ISALPHA(ch) iswalpha(ch) 360 361#else 362 363/* Since splitting on whitespace is an important use case, and whitespace 364 in most situations is solely ASCII whitespace, we optimize for the common 365 case by using a quick look-up table with an inlined check. 366 */ 367extern const unsigned char _Py_ascii_whitespace[]; 368 369#define Py_UNICODE_ISSPACE(ch) \ 370 ((ch) < 128U ? 
_Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 371 372#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 373#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 374#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 375#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 376 377#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 378#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 379#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 380 381#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 382#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 383#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 384 385#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 386#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 387#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 388 389#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 390 391#endif 392 /* True if ch passes any of the ISALPHA, ISDECIMAL, ISDIGIT or ISNUMERIC tests. ch is expanded once per test (up to four times), so it must be free of side effects. */ 393#define Py_UNICODE_ISALNUM(ch) \ 394 (Py_UNICODE_ISALPHA(ch) || \ 395 Py_UNICODE_ISDECIMAL(ch) || \ 396 Py_UNICODE_ISDIGIT(ch) || \ 397 Py_UNICODE_ISNUMERIC(ch)) 398 /* Copy length Py_UNICODE elements from source to target using Py_MEMCPY; length counts elements, not bytes. */ 399#define Py_UNICODE_COPY(target, source, length) \ 400 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 401 /* Store value in the first length elements of target. target and value are evaluated once (into t_ and v_); length is re-evaluated on every loop test. */ 402#define Py_UNICODE_FILL(target, value, length) do\ 403 {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 404 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 405 } while (0) 406 407/* check if substring matches at given offset.
the offset must be 408 valid, and the substring must not be empty. The first and last elements are compared before memcmp() is called on the whole range; string, offset and substring are each expanded several times, so they must be free of side effects. */ 409#define Py_UNICODE_MATCH(string, offset, substring) \ 410 ((*((string)->str + (offset)) == *((substring)->str)) && \ 411 ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ 412 !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) 413 414#ifdef __cplusplus 415extern "C" { 416#endif 417 418PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 419 420/* --- Unicode Type ------------------------------------------------------- */ 421 422typedef struct { 423 PyObject_HEAD 424 Py_ssize_t length; /* Length of raw Unicode data in buffer */ 425 Py_UNICODE *str; /* Raw Unicode buffer */ 426 long hash; /* Hash value; -1 if not set */ 427 int state; /* != 0 if interned. In this case the two 428 * references from the dictionary to this object 429 * are *not* counted in ob_refcnt. */ 430 PyObject *defenc; /* (Default) Encoded version as Python 431 string, or NULL; this is used for 432 implementing the buffer protocol */ 433} PyUnicodeObject; 434 435PyAPI_DATA(PyTypeObject) PyUnicode_Type; 436PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 437 /* NOTE(review): presumably the values stored in PyUnicodeObject.state (0 meaning not interned) -- confirm against the interning code. */ 438#define SSTATE_NOT_INTERNED 0 439#define SSTATE_INTERNED_MORTAL 1 440#define SSTATE_INTERNED_IMMORTAL 2 441 /* Type checks: PyUnicode_Check tests the Py_TPFLAGS_UNICODE_SUBCLASS flag of op's type via PyType_FastSubclass; PyUnicode_CheckExact compares op's type pointer against &PyUnicode_Type. */ 442#define PyUnicode_Check(op) \ 443 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 444#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 445 446/* Fast access macros. The only validation is an assert() that op passes PyUnicode_Check(). GET_SIZE yields the length field, GET_DATA_SIZE the length multiplied by sizeof(Py_UNICODE), AS_UNICODE the str buffer and AS_DATA the same buffer cast to const char *. op is expanded twice in each macro. */ 447#define PyUnicode_GET_SIZE(op) \ 448 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length)) 449#define PyUnicode_GET_DATA_SIZE(op) \ 450 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))) 451#define PyUnicode_AS_UNICODE(op) \ 452 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str)) 453#define PyUnicode_AS_DATA(op) \ 454 (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str)) 455 456/* --- Constants
---------------------------------------------------------- */ 457 458/* This Unicode character will be used as replacement character during 459 decoding if the errors argument is set to "replace". Note: the 460 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 461 Unicode 3.0. */ 462 463#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 464 465/* === Public API ========================================================= */ 466 467/* --- Plain Py_UNICODE --------------------------------------------------- */ 468 469/* Create a Unicode Object from the Py_UNICODE buffer u of the given 470 size. 471 472 u may be NULL which causes the contents to be undefined. It is the 473 user's responsibility to fill in the needed data afterwards. Note 474 that modifying the Unicode object contents after construction is 475 only allowed if u was set to NULL. 476 477 The buffer is copied into the new object. */ 478 479PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 480 const Py_UNICODE *u, /* Unicode buffer */ 481 Py_ssize_t size /* size of buffer */ 482 ); 483 484/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */ 485PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 486 const char *u, /* char buffer */ 487 Py_ssize_t size /* size of buffer */ 488 ); 489 490/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 491 Latin-1 encoded bytes */ 492PyAPI_FUNC(PyObject*) PyUnicode_FromString( 493 const char *u /* string */ 494 ); 495 496/* Return a read-only pointer to the Unicode object's internal 497 Py_UNICODE buffer. */ 498 499PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 500 PyObject *unicode /* Unicode object */ 501 ); 502 503/* Get the length of the Unicode object. */ 504 505PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 506 PyObject *unicode /* Unicode object */ 507 ); 508 509/* Get the maximum ordinal for a Unicode character. 
*/ 510PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 511 512/* Resize an already allocated Unicode object to the new size length. 513 514 *unicode is modified to point to the new (resized) object and 0 515 returned on success. 516 517 This API may only be called by the function which also called the 518 Unicode constructor. The refcount on the object must be 1. Otherwise, 519 an error is returned. 520 521 Error handling is implemented as follows: an exception is set, -1 522 is returned and *unicode left untouched. 523 524*/ 525 526PyAPI_FUNC(int) PyUnicode_Resize( 527 PyObject **unicode, /* Pointer to the Unicode object */ 528 Py_ssize_t length /* New length */ 529 ); 530 531/* Coerce obj to an Unicode object and return a reference with 532 *incremented* refcount. 533 534 Coercion is done in the following way: 535 536 1. String and other char buffer compatible objects are decoded 537 under the assumptions that they contain data using the current 538 default encoding. Decoding is done in "strict" mode. 539 540 2. All other objects (including Unicode objects) raise an 541 exception. 542 543 The API returns NULL in case of an error. The caller is responsible 544 for decref'ing the returned objects. 545 546*/ 547 548PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 549 register PyObject *obj, /* Object */ 550 const char *encoding, /* encoding */ 551 const char *errors /* error handling */ 552 ); 553 554/* Coerce obj to an Unicode object and return a reference with 555 *incremented* refcount. 556 557 Unicode objects are passed back as-is (subclasses are converted to 558 true Unicode objects), all other objects are delegated to 559 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 560 using the default encoding as basis for decoding the object. 561 562 The API returns NULL in case of an error. The caller is responsible 563 for decref'ing the returned objects. 
564 565*/ 566 567PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 568 register PyObject *obj /* Object */ 569 ); 570 571PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list); 572PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...); 573 574PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 575PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 576PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *); 577PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 578 579/* Use only if you know it's a string */ 580#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state) 581 582/* --- wchar_t support for platforms which support it --------------------- */ 583 584#ifdef HAVE_WCHAR_H 585 586/* Create a Unicode Object from the wchar_t buffer w of the given 587 size. 588 589 The buffer is copied into the new object. */ 590 591PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 592 register const wchar_t *w, /* wchar_t buffer */ 593 Py_ssize_t size /* size of buffer */ 594 ); 595 596/* Copies the Unicode Object contents into the wchar_t buffer w. At 597 most size wchar_t characters are copied. 598 599 Note that the resulting wchar_t string may or may not be 600 0-terminated. It is the responsibility of the caller to make sure 601 that the wchar_t string is 0-terminated in case this is required by 602 the application. 603 604 Returns the number of wchar_t characters copied (excluding a 605 possibly trailing 0-termination character) or -1 in case of an 606 error. */ 607 608PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 609 PyUnicodeObject *unicode, /* Unicode object */ 610 register wchar_t *w, /* wchar_t buffer */ 611 Py_ssize_t size /* size of buffer */ 612 ); 613 614#endif 615 616/* --- Unicode ordinals --------------------------------------------------- */ 617 618/* Create a Unicode Object from the given Unicode code point ordinal.
619 620 The ordinal must be in range(0x10000) on narrow Python builds 621 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 622 raised in case it is not. 623 624*/ 625 626PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 627 628/* === Builtin Codecs ===================================================== 629 630 Many of these APIs take two arguments encoding and errors. These 631 parameters encoding and errors have the same semantics as the ones 632 of the builtin unicode() API. 633 634 Setting encoding to NULL causes the default encoding to be used. 635 636 Error handling is set by errors which may also be set to NULL 637 meaning to use the default handling defined for the codec. Default 638 error handling for all builtin codecs is "strict" (ValueErrors are 639 raised). 640 641 The codecs all use a similar interface. Only deviations from the 642 generic ones are documented. 643 644*/ 645 646/* --- Manage the default encoding ---------------------------------------- */ 647 648/* Return a Python string holding the default encoded value of the 649 Unicode object. 650 651 The resulting string is cached in the Unicode object for subsequent 652 usage by this function. The cached version is needed to implement 653 the character buffer interface and will live (at least) as long as 654 the Unicode object itself. 655 656 The refcount of the string is *not* incremented. 657 658 *** Exported for internal use by the interpreter only !!! *** 659 660*/ 661 662PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString( 663 PyObject *, const char *); 664 665/* Decode a null-terminated string using Py_FileSystemDefaultEncoding. 666 667 If the encoding is supported by one of the built-in codecs (i.e., UTF-8, 668 UTF-16, UTF-32, Latin-1 or MBCS), that codec is used; otherwise fall back 669 to UTF-8 and replace invalid characters with '?'. 670 671 The function is intended to be used for paths and file names only 672 during the bootstrapping process where the codecs are not set up.
673*/ 674 675PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 676 const char *s /* encoded string */ 677 ); 678 679PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 680 const char *s, /* encoded string */ 681 Py_ssize_t size /* size */ 682 ); 683 684 685/* Returns the UTF-8 encoding, and its size. 686 687 If the output argument is NULL, no size is stored. 688 689*/ 690 691PyAPI_FUNC(char *) PyUnicode_AsStringAndSize(PyObject*, Py_ssize_t *); 692 693/* Return a char* holding the UTF-8 encoded value of the Unicode object. 694 695 This is equivalent to PyUnicode_AsStringAndSize(x, NULL). DEPRECATED: use PyUnicode_AsStringAndSize() instead (NOTE(review): the deprecation notice presumably refers to this function -- confirm). 696 */ 697 698PyAPI_FUNC(char *) PyUnicode_AsString(PyObject*); 699 700/* Returns the currently active default encoding. 701 702 703 704 */ 705 706PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 707 708/* Sets the currently active default encoding. 709 710 Returns 0 on success, -1 in case of an error. 711 712 */ 713 714PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding( 715 const char *encoding /* Encoding name in standard form */ 716 ); 717 718/* --- Generic Codecs ----------------------------------------------------- */ 719 720/* Create a Unicode object by decoding the encoded string s of the 721 given size. */ 722 723PyAPI_FUNC(PyObject*) PyUnicode_Decode( 724 const char *s, /* encoded string */ 725 Py_ssize_t size, /* size of buffer */ 726 const char *encoding, /* encoding */ 727 const char *errors /* error handling */ 728 ); 729 730/* Encodes a Py_UNICODE buffer of the given size and returns a 731 Python string object. */ 732 733PyAPI_FUNC(PyObject*) PyUnicode_Encode( 734 const Py_UNICODE *s, /* Unicode char buffer */ 735 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 736 const char *encoding, /* encoding */ 737 const char *errors /* error handling */ 738 ); 739 740/* Encodes a Unicode object and returns the result as Python 741 object.
*/ 742 743PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 744 PyObject *unicode, /* Unicode object */ 745 const char *encoding, /* encoding */ 746 const char *errors /* error handling */ 747 ); 748 749/* Encodes a Unicode object and returns the result as Python string 750 object. */ 751 752PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 753 PyObject *unicode, /* Unicode object */ 754 const char *encoding, /* encoding */ 755 const char *errors /* error handling */ 756 ); 757 758PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 759 PyObject* string /* 256 character map */ 760 ); 761 762 763/* --- UTF-7 Codecs ------------------------------------------------------- */ 764 765PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 766 const char *string, /* UTF-7 encoded string */ 767 Py_ssize_t length, /* size of string */ 768 const char *errors /* error handling */ 769 ); 770 771PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 772 const char *string, /* UTF-7 encoded string */ 773 Py_ssize_t length, /* size of string */ 774 const char *errors, /* error handling */ 775 Py_ssize_t *consumed /* bytes consumed */ 776 ); 777 778PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 779 const Py_UNICODE *data, /* Unicode char buffer */ 780 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 781 int encodeSetO, /* force the encoder to encode characters in 782 Set O, as described in RFC2152 */ 783 int encodeWhiteSpace, /* force the encoder to encode space, tab, 784 carriage return and linefeed characters */ 785 const char *errors /* error handling */ 786 ); 787 788/* --- UTF-8 Codecs ------------------------------------------------------- */ 789 790PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 791 const char *string, /* UTF-8 encoded string */ 792 Py_ssize_t length, /* size of string */ 793 const char *errors /* error handling */ 794 ); 795 796PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 797 const char *string, /* UTF-8 encoded string */ 798 Py_ssize_t length, /* size of string 
*/ 799 const char *errors, /* error handling */ 800 Py_ssize_t *consumed /* bytes consumed */ 801 ); 802 803PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 804 PyObject *unicode /* Unicode object */ 805 ); 806 807PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 808 const Py_UNICODE *data, /* Unicode char buffer */ 809 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 810 const char *errors /* error handling */ 811 ); 812 813/* --- UTF-32 Codecs ------------------------------------------------------ */ 814 815/* Decodes length bytes from a UTF-32 encoded buffer string and returns 816 the corresponding Unicode object. 817 818 errors (if non-NULL) defines the error handling. It defaults 819 to "strict". 820 821 If byteorder is non-NULL, the decoder starts decoding using the 822 given byte order: 823 824 *byteorder == -1: little endian 825 *byteorder == 0: native order 826 *byteorder == 1: big endian 827 828 In native mode, the first four bytes of the stream are checked for a 829 BOM mark. If found, the BOM mark is analysed, the byte order 830 adjusted and the BOM skipped. In the other modes, no BOM mark 831 interpretation is done. After completion, *byteorder is set to the 832 current byte order at the end of input data. 833 834 If byteorder is NULL, the codec starts in native order mode. 
835 836*/ 837 838PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 839 const char *string, /* UTF-32 encoded string */ 840 Py_ssize_t length, /* size of string */ 841 const char *errors, /* error handling */ 842 int *byteorder /* pointer to byteorder to use 843 0=native;-1=LE,1=BE; updated on 844 exit */ 845 ); 846 847PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 848 const char *string, /* UTF-32 encoded string */ 849 Py_ssize_t length, /* size of string */ 850 const char *errors, /* error handling */ 851 int *byteorder, /* pointer to byteorder to use 852 0=native;-1=LE,1=BE; updated on 853 exit */ 854 Py_ssize_t *consumed /* bytes consumed */ 855 ); 856 857/* Returns a Python string using the UTF-32 encoding in native byte 858 order. The string always starts with a BOM mark. */ 859 860PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 861 PyObject *unicode /* Unicode object */ 862 ); 863 864/* Returns a Python string object holding the UTF-32 encoded value of 865 the Unicode data. 866 867 If byteorder is not 0, output is written according to the following 868 byte order: 869 870 byteorder == -1: little endian 871 byteorder == 0: native byte order (writes a BOM mark) 872 byteorder == 1: big endian 873 874 If byteorder is 0, the output string will always start with the 875 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 876 prepended. 877 878*/ 879 880PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 881 const Py_UNICODE *data, /* Unicode char buffer */ 882 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 883 const char *errors, /* error handling */ 884 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 885 ); 886 887/* --- UTF-16 Codecs ------------------------------------------------------ */ 888 889/* Decodes length bytes from a UTF-16 encoded buffer string and returns 890 the corresponding Unicode object. 891 892 errors (if non-NULL) defines the error handling. It defaults 893 to "strict". 
894 895 If byteorder is non-NULL, the decoder starts decoding using the 896 given byte order: 897 898 *byteorder == -1: little endian 899 *byteorder == 0: native order 900 *byteorder == 1: big endian 901 902 In native mode, the first two bytes of the stream are checked for a 903 BOM mark. If found, the BOM mark is analysed, the byte order 904 adjusted and the BOM skipped. In the other modes, no BOM mark 905 interpretation is done. After completion, *byteorder is set to the 906 current byte order at the end of input data. 907 908 If byteorder is NULL, the codec starts in native order mode. 909 910*/ 911 912PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 913 const char *string, /* UTF-16 encoded string */ 914 Py_ssize_t length, /* size of string */ 915 const char *errors, /* error handling */ 916 int *byteorder /* pointer to byteorder to use 917 0=native;-1=LE,1=BE; updated on 918 exit */ 919 ); 920 921PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 922 const char *string, /* UTF-16 encoded string */ 923 Py_ssize_t length, /* size of string */ 924 const char *errors, /* error handling */ 925 int *byteorder, /* pointer to byteorder to use 926 0=native;-1=LE,1=BE; updated on 927 exit */ 928 Py_ssize_t *consumed /* bytes consumed */ 929 ); 930 931/* Returns a Python string using the UTF-16 encoding in native byte 932 order. The string always starts with a BOM mark. */ 933 934PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 935 PyObject *unicode /* Unicode object */ 936 ); 937 938/* Returns a Python string object holding the UTF-16 encoded value of 939 the Unicode data. 940 941 If byteorder is not 0, output is written according to the following 942 byte order: 943 944 byteorder == -1: little endian 945 byteorder == 0: native byte order (writes a BOM mark) 946 byteorder == 1: big endian 947 948 If byteorder is 0, the output string will always start with the 949 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 950 prepended. 
951 952 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 953 UCS-2. This trick makes it possible to add full UTF-16 capabilities 954 at a later point without compromising the APIs. 955 956*/ 957 958PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 959 const Py_UNICODE *data, /* Unicode char buffer */ 960 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 961 const char *errors, /* error handling */ 962 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 963 ); 964 965/* --- Unicode-Escape Codecs ---------------------------------------------- */ 966 967PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 968 const char *string, /* Unicode-Escape encoded string */ 969 Py_ssize_t length, /* size of string */ 970 const char *errors /* error handling */ 971 ); 972 973PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 974 PyObject *unicode /* Unicode object */ 975 ); 976 977PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 978 const Py_UNICODE *data, /* Unicode char buffer */ 979 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 980 ); 981 982/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 983 984PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 985 const char *string, /* Raw-Unicode-Escape encoded string */ 986 Py_ssize_t length, /* size of string */ 987 const char *errors /* error handling */ 988 ); 989 990PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 991 PyObject *unicode /* Unicode object */ 992 ); 993 994PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 995 const Py_UNICODE *data, /* Unicode char buffer */ 996 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 997 ); 998 999/* --- Unicode Internal Codec --------------------------------------------- 1000 1001 Only for internal use in _codecsmodule.c */ 1002 1003PyObject *_PyUnicode_DecodeUnicodeInternal( 1004 const char *string, 1005 Py_ssize_t length, 1006 const char *errors 1007 ); 1008 1009/* --- 
Latin-1 Codecs ----------------------------------------------------- 1010 1011 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 1012 1013*/ 1014 1015PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1016 const char *string, /* Latin-1 encoded string */ 1017 Py_ssize_t length, /* size of string */ 1018 const char *errors /* error handling */ 1019 ); 1020 1021PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1022 PyObject *unicode /* Unicode object */ 1023 ); 1024 1025PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1026 const Py_UNICODE *data, /* Unicode char buffer */ 1027 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1028 const char *errors /* error handling */ 1029 ); 1030 1031/* --- ASCII Codecs ------------------------------------------------------- 1032 1033 Only 7-bit ASCII data is excepted. All other codes generate errors. 1034 1035*/ 1036 1037PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1038 const char *string, /* ASCII encoded string */ 1039 Py_ssize_t length, /* size of string */ 1040 const char *errors /* error handling */ 1041 ); 1042 1043PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1044 PyObject *unicode /* Unicode object */ 1045 ); 1046 1047PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1048 const Py_UNICODE *data, /* Unicode char buffer */ 1049 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1050 const char *errors /* error handling */ 1051 ); 1052 1053/* --- Character Map Codecs ----------------------------------------------- 1054 1055 This codec uses mappings to encode and decode characters. 1056 1057 Decoding mappings must map single string characters to single 1058 Unicode characters, integers (which are then interpreted as Unicode 1059 ordinals) or None (meaning "undefined mapping" and causing an 1060 error). 
1061 1062 Encoding mappings must map single Unicode characters to single 1063 string characters, integers (which are then interpreted as Latin-1 1064 ordinals) or None (meaning "undefined mapping" and causing an 1065 error). 1066 1067 If a character lookup fails with a LookupError, the character is 1068 copied as-is meaning that its ordinal value will be interpreted as 1069 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1070 to contain those mappings which map characters to different code 1071 points. 1072 1073*/ 1074 1075PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1076 const char *string, /* Encoded string */ 1077 Py_ssize_t length, /* size of string */ 1078 PyObject *mapping, /* character mapping 1079 (char ordinal -> unicode ordinal) */ 1080 const char *errors /* error handling */ 1081 ); 1082 1083PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1084 PyObject *unicode, /* Unicode object */ 1085 PyObject *mapping /* character mapping 1086 (unicode ordinal -> char ordinal) */ 1087 ); 1088 1089PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1090 const Py_UNICODE *data, /* Unicode char buffer */ 1091 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1092 PyObject *mapping, /* character mapping 1093 (unicode ordinal -> char ordinal) */ 1094 const char *errors /* error handling */ 1095 ); 1096 1097/* Translate a Py_UNICODE buffer of the given length by applying a 1098 character mapping table to it and return the resulting Unicode 1099 object. 1100 1101 The mapping table must map Unicode ordinal integers to Unicode 1102 ordinal integers or None (causing deletion of the character). 1103 1104 Mapping tables may be dictionaries or sequences. Unmapped character 1105 ordinals (ones which cause a LookupError) are left untouched and 1106 are copied as-is. 
1107 1108*/ 1109 1110PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1111 const Py_UNICODE *data, /* Unicode char buffer */ 1112 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1113 PyObject *table, /* Translate table */ 1114 const char *errors /* error handling */ 1115 ); 1116 1117#ifdef MS_WIN32 1118 1119/* --- MBCS codecs for Windows -------------------------------------------- */ 1120 1121PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1122 const char *string, /* MBCS encoded string */ 1123 Py_ssize_t length, /* size of string */ 1124 const char *errors /* error handling */ 1125 ); 1126 1127PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1128 const char *string, /* MBCS encoded string */ 1129 Py_ssize_t length, /* size of string */ 1130 const char *errors, /* error handling */ 1131 Py_ssize_t *consumed /* bytes consumed */ 1132 ); 1133 1134PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1135 PyObject *unicode /* Unicode object */ 1136 ); 1137 1138PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1139 const Py_UNICODE *data, /* Unicode char buffer */ 1140 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1141 const char *errors /* error handling */ 1142 ); 1143 1144#endif /* MS_WIN32 */ 1145 1146/* --- Decimal Encoder ---------------------------------------------------- */ 1147 1148/* Takes a Unicode string holding a decimal value and writes it into 1149 an output buffer using standard ASCII digit codes. 1150 1151 The output buffer has to provide at least length+1 bytes of storage 1152 area. The output string is 0-terminated. 1153 1154 The encoder converts whitespace to ' ', decimal characters to their 1155 corresponding ASCII digit and all other Latin-1 characters except 1156 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 1157 are treated as errors. This includes embedded NULL bytes. 
1158 1159 Error handling is defined by the errors argument: 1160 1161 NULL or "strict": raise a ValueError 1162 "ignore": ignore the wrong characters (these are not copied to the 1163 output buffer) 1164 "replace": replaces illegal characters with '?' 1165 1166 Returns 0 on success, -1 on failure. 1167 1168*/ 1169 1170PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1171 Py_UNICODE *s, /* Unicode buffer */ 1172 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1173 char *output, /* Output buffer; must have size >= length */ 1174 const char *errors /* error handling */ 1175 ); 1176 1177/* --- Methods & Slots ---------------------------------------------------- 1178 1179 These are capable of handling Unicode objects and strings on input 1180 (we refer to them as strings in the descriptions) and return 1181 Unicode objects or integers as apporpriate. */ 1182 1183/* Concat two strings giving a new Unicode string. */ 1184 1185PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1186 PyObject *left, /* Left string */ 1187 PyObject *right /* Right string */ 1188 ); 1189 1190/* Concat two strings and put the result in *pleft 1191 (sets *pleft to NULL on error) */ 1192 1193PyAPI_FUNC(void) PyUnicode_Append( 1194 PyObject **pleft, /* Pointer to left string */ 1195 PyObject *right /* Right string */ 1196 ); 1197 1198/* Concat two strings, put the result in *pleft and drop the right object 1199 (sets *pleft to NULL on error) */ 1200 1201PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1202 PyObject **pleft, /* Pointer to left string */ 1203 PyObject *right /* Right string */ 1204 ); 1205 1206/* Split a string giving a list of Unicode strings. 1207 1208 If sep is NULL, splitting will be done at all whitespace 1209 substrings. Otherwise, splits occur at the given separator. 1210 1211 At most maxsplit splits will be done. If negative, no limit is set. 1212 1213 Separators are not included in the resulting list. 
1214 1215*/ 1216 1217PyAPI_FUNC(PyObject*) PyUnicode_Split( 1218 PyObject *s, /* String to split */ 1219 PyObject *sep, /* String separator */ 1220 Py_ssize_t maxsplit /* Maxsplit count */ 1221 ); 1222 1223/* Dito, but split at line breaks. 1224 1225 CRLF is considered to be one line break. Line breaks are not 1226 included in the resulting list. */ 1227 1228PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1229 PyObject *s, /* String to split */ 1230 int keepends /* If true, line end markers are included */ 1231 ); 1232 1233/* Partition a string using a given separator. */ 1234 1235PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1236 PyObject *s, /* String to partition */ 1237 PyObject *sep /* String separator */ 1238 ); 1239 1240/* Partition a string using a given separator, searching from the end of the 1241 string. */ 1242 1243PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1244 PyObject *s, /* String to partition */ 1245 PyObject *sep /* String separator */ 1246 ); 1247 1248/* Split a string giving a list of Unicode strings. 1249 1250 If sep is NULL, splitting will be done at all whitespace 1251 substrings. Otherwise, splits occur at the given separator. 1252 1253 At most maxsplit splits will be done. But unlike PyUnicode_Split 1254 PyUnicode_RSplit splits from the end of the string. If negative, 1255 no limit is set. 1256 1257 Separators are not included in the resulting list. 1258 1259*/ 1260 1261PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1262 PyObject *s, /* String to split */ 1263 PyObject *sep, /* String separator */ 1264 Py_ssize_t maxsplit /* Maxsplit count */ 1265 ); 1266 1267/* Translate a string by applying a character mapping table to it and 1268 return the resulting Unicode object. 1269 1270 The mapping table must map Unicode ordinal integers to Unicode 1271 ordinal integers or None (causing deletion of the character). 1272 1273 Mapping tables may be dictionaries or sequences. 
Unmapped character 1274 ordinals (ones which cause a LookupError) are left untouched and 1275 are copied as-is. 1276 1277*/ 1278 1279PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1280 PyObject *str, /* String */ 1281 PyObject *table, /* Translate table */ 1282 const char *errors /* error handling */ 1283 ); 1284 1285/* Join a sequence of strings using the given separator and return 1286 the resulting Unicode string. */ 1287 1288PyAPI_FUNC(PyObject*) PyUnicode_Join( 1289 PyObject *separator, /* Separator string */ 1290 PyObject *seq /* Sequence object */ 1291 ); 1292 1293/* Return 1 if substr matches str[start:end] at the given tail end, 0 1294 otherwise. */ 1295 1296PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1297 PyObject *str, /* String */ 1298 PyObject *substr, /* Prefix or Suffix string */ 1299 Py_ssize_t start, /* Start index */ 1300 Py_ssize_t end, /* Stop index */ 1301 int direction /* Tail end: -1 prefix, +1 suffix */ 1302 ); 1303 1304/* Return the first position of substr in str[start:end] using the 1305 given search direction or -1 if not found. -2 is returned in case 1306 an error occurred and an exception is set. */ 1307 1308PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1309 PyObject *str, /* String */ 1310 PyObject *substr, /* Substring to find */ 1311 Py_ssize_t start, /* Start index */ 1312 Py_ssize_t end, /* Stop index */ 1313 int direction /* Find direction: +1 forward, -1 backward */ 1314 ); 1315 1316/* Count the number of occurrences of substr in str[start:end]. */ 1317 1318PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1319 PyObject *str, /* String */ 1320 PyObject *substr, /* Substring to count */ 1321 Py_ssize_t start, /* Start index */ 1322 Py_ssize_t end /* Stop index */ 1323 ); 1324 1325/* Replace at most maxcount occurrences of substr in str with replstr 1326 and return the resulting Unicode object. 
*/ 1327 1328PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1329 PyObject *str, /* String */ 1330 PyObject *substr, /* Substring to find */ 1331 PyObject *replstr, /* Substring to replace */ 1332 Py_ssize_t maxcount /* Max. number of replacements to apply; 1333 -1 = all */ 1334 ); 1335 1336/* Compare two strings and return -1, 0, 1 for less than, equal, 1337 greater than resp. */ 1338 1339PyAPI_FUNC(int) PyUnicode_Compare( 1340 PyObject *left, /* Left string */ 1341 PyObject *right /* Right string */ 1342 ); 1343 1344PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1345 PyObject *left, 1346 const char *right 1347 ); 1348 1349/* Rich compare two strings and return one of the following: 1350 1351 - NULL in case an exception was raised 1352 - Py_True or Py_False for successfuly comparisons 1353 - Py_NotImplemented in case the type combination is unknown 1354 1355 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1356 case the conversion of the arguments to Unicode fails with a 1357 UnicodeDecodeError. 1358 1359 Possible values for op: 1360 1361 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1362 1363*/ 1364 1365PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1366 PyObject *left, /* Left string */ 1367 PyObject *right, /* Right string */ 1368 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1369 ); 1370 1371/* Apply a argument tuple or dictionary to a format string and return 1372 the resulting Unicode string. */ 1373 1374PyAPI_FUNC(PyObject *) PyUnicode_Format( 1375 PyObject *format, /* Format string */ 1376 PyObject *args /* Argument tuple or dictionary */ 1377 ); 1378 1379/* Checks whether element is contained in container and return 1/0 1380 accordingly. 1381 1382 element has to coerce to an one element Unicode string. -1 is 1383 returned in case of an error. 
*/ 1384 1385PyAPI_FUNC(int) PyUnicode_Contains( 1386 PyObject *container, /* Container string */ 1387 PyObject *element /* Element string */ 1388 ); 1389 1390/* Checks whether argument is a valid identifier. */ 1391 1392PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1393 1394/* Externally visible for str.strip(unicode) */ 1395PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 1396 PyUnicodeObject *self, 1397 int striptype, 1398 PyObject *sepobj 1399 ); 1400 1401/* === Characters Type APIs =============================================== */ 1402 1403/* These should not be used directly. Use the Py_UNICODE_IS* and 1404 Py_UNICODE_TO* macros instead. 1405 1406 These APIs are implemented in Objects/unicodectype.c. 1407 1408*/ 1409 1410PyAPI_FUNC(int) _PyUnicode_IsLowercase( 1411 Py_UNICODE ch /* Unicode character */ 1412 ); 1413 1414PyAPI_FUNC(int) _PyUnicode_IsUppercase( 1415 Py_UNICODE ch /* Unicode character */ 1416 ); 1417 1418PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 1419 Py_UNICODE ch /* Unicode character */ 1420 ); 1421 1422PyAPI_FUNC(int) _PyUnicode_IsXidStart( 1423 Py_UNICODE ch /* Unicode character */ 1424 ); 1425 1426PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 1427 Py_UNICODE ch /* Unicode character */ 1428 ); 1429 1430PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 1431 const Py_UNICODE ch /* Unicode character */ 1432 ); 1433 1434PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 1435 const Py_UNICODE ch /* Unicode character */ 1436 ); 1437 1438PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase( 1439 Py_UNICODE ch /* Unicode character */ 1440 ); 1441 1442PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase( 1443 Py_UNICODE ch /* Unicode character */ 1444 ); 1445 1446PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase( 1447 Py_UNICODE ch /* Unicode character */ 1448 ); 1449 1450PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 1451 Py_UNICODE ch /* Unicode character */ 1452 ); 1453 1454PyAPI_FUNC(int) _PyUnicode_ToDigit( 1455 Py_UNICODE ch /* Unicode character */ 1456 ); 1457 1458PyAPI_FUNC(double) 
_PyUnicode_ToNumeric( 1459 Py_UNICODE ch /* Unicode character */ 1460 ); 1461 1462PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 1463 Py_UNICODE ch /* Unicode character */ 1464 ); 1465 1466PyAPI_FUNC(int) _PyUnicode_IsDigit( 1467 Py_UNICODE ch /* Unicode character */ 1468 ); 1469 1470PyAPI_FUNC(int) _PyUnicode_IsNumeric( 1471 Py_UNICODE ch /* Unicode character */ 1472 ); 1473 1474PyAPI_FUNC(int) _PyUnicode_IsAlpha( 1475 Py_UNICODE ch /* Unicode character */ 1476 ); 1477 1478PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u); 1479 1480PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 1481 Py_UNICODE *s1, const Py_UNICODE *s2); 1482 1483PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 1484 Py_UNICODE *s1, const Py_UNICODE *s2, size_t n); 1485 1486PyAPI_FUNC(int) Py_UNICODE_strcmp( 1487 const Py_UNICODE *s1, const Py_UNICODE *s2); 1488 1489PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 1490 const Py_UNICODE *s, Py_UNICODE c 1491 ); 1492 1493#ifdef __cplusplus 1494} 1495#endif 1496#endif /* !Py_UNICODEOBJECT_H */ 1497