unicodeobject.h revision 7ade6485abde95c5cc9676ad3e476ba3aca98037
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4/* 5 6Unicode implementation based on original code by Fredrik Lundh, 7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 8Unicode Integration Proposal (see file Misc/unicode.txt). 9 10Copyright (c) Corporation for National Research Initiatives. 11 12 13 Original header: 14 -------------------------------------------------------------------- 15 16 * Yet another Unicode string type for Python. This type supports the 17 * 16-bit Basic Multilingual Plane (BMP) only. 18 * 19 * Written by Fredrik Lundh, January 1999. 20 * 21 * Copyright (c) 1999 by Secret Labs AB. 22 * Copyright (c) 1999 by Fredrik Lundh. 23 * 24 * fredrik@pythonware.com 25 * http://www.pythonware.com 26 * 27 * -------------------------------------------------------------------- 28 * This Unicode String Type is 29 * 30 * Copyright (c) 1999 by Secret Labs AB 31 * Copyright (c) 1999 by Fredrik Lundh 32 * 33 * By obtaining, using, and/or copying this software and/or its 34 * associated documentation, you agree that you have read, understood, 35 * and will comply with the following terms and conditions: 36 * 37 * Permission to use, copy, modify, and distribute this software and its 38 * associated documentation for any purpose and without fee is hereby 39 * granted, provided that the above copyright notice appears in all 40 * copies, and that both that copyright notice and this permission notice 41 * appear in supporting documentation, and that the name of Secret Labs 42 * AB or the author not be used in advertising or publicity pertaining to 43 * distribution of the software without specific, written prior 44 * permission. 45 * 46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 48 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 53 * -------------------------------------------------------------------- */ 54 55#include <ctype.h> 56 57/* === Internal API ======================================================= */ 58 59/* --- Internal Unicode Format -------------------------------------------- */ 60 61/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is 62 properly set, but the default rules below doesn't set it. I'll 63 sort this out some other day -- fredrik@pythonware.com */ 64 65#ifndef Py_UNICODE_SIZE 66#error Must define Py_UNICODE_SIZE 67#endif 68 69/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode 70 strings are stored as UCS-2 (with limited support for UTF-16) */ 71 72#if Py_UNICODE_SIZE >= 4 73#define Py_UNICODE_WIDE 74#endif 75 76/* Set these flags if the platform has "wchar.h", "wctype.h" and the 77 wchar_t type is a 16-bit unsigned type */ 78/* #define HAVE_WCHAR_H */ 79/* #define HAVE_USABLE_WCHAR_T */ 80 81/* Defaults for various platforms */ 82#ifndef PY_UNICODE_TYPE 83 84/* Windows has a usable wchar_t type (unless we're using UCS-4) */ 85# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 86# define HAVE_USABLE_WCHAR_T 87# define PY_UNICODE_TYPE wchar_t 88# endif 89 90# if defined(Py_UNICODE_WIDE) 91# define PY_UNICODE_TYPE Py_UCS4 92# endif 93 94#endif 95 96/* If the compiler provides a wchar_t type we try to support it 97 through the interface functions PyUnicode_FromWideChar() and 98 PyUnicode_AsWideChar(). 
*/ 99 100#ifdef HAVE_USABLE_WCHAR_T 101# ifndef HAVE_WCHAR_H 102# define HAVE_WCHAR_H 103# endif 104#endif 105 106#ifdef HAVE_WCHAR_H 107/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 108# ifdef _HAVE_BSDI 109# include <time.h> 110# endif 111# include <wchar.h> 112#endif 113 114/* 115 * Use this typedef when you need to represent a UTF-16 surrogate pair 116 * as single unsigned integer. 117 */ 118#if SIZEOF_INT >= 4 119typedef unsigned int Py_UCS4; 120#elif SIZEOF_LONG >= 4 121typedef unsigned long Py_UCS4; 122#endif 123 124typedef PY_UNICODE_TYPE Py_UNICODE; 125 126/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ 127 128/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds 129 produce different external names and thus cause import errors in 130 case Python interpreters and extensions with mixed compiled in 131 Unicode width assumptions are combined. */ 132 133#ifndef Py_UNICODE_WIDE 134 135# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString 136# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString 137# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject 138# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString 139# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String 140# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString 141# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String 142# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String 143# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String 144# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode 145# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString 146# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar 147# define PyUnicode_Compare PyUnicodeUCS2_Compare 148# define PyUnicode_Concat PyUnicodeUCS2_Concat 149# define PyUnicode_Append PyUnicodeUCS2_Append 150# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel 
151# define PyUnicode_Contains PyUnicodeUCS2_Contains 152# define PyUnicode_Count PyUnicodeUCS2_Count 153# define PyUnicode_Decode PyUnicodeUCS2_Decode 154# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII 155# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap 156# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 157# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape 158# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 159# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful 160# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 161# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful 162# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 163# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful 164# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape 165# define PyUnicode_Encode PyUnicodeUCS2_Encode 166# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII 167# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap 168# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal 169# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 170# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape 171# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 172# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 173# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 174# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape 175# define PyUnicode_Find PyUnicodeUCS2_Find 176# define PyUnicode_Format PyUnicodeUCS2_Format 177# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject 178# define PyUnicode_FromObject PyUnicodeUCS2_FromObject 179# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal 180# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode 181# define PyUnicode_FromString PyUnicodeUCS2_FromString 182# define PyUnicode_FromStringAndSize 
PyUnicodeUCS2_FromStringAndSize 183# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV 184# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat 185# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar 186# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding 187# define PyUnicode_GetMax PyUnicodeUCS2_GetMax 188# define PyUnicode_GetSize PyUnicodeUCS2_GetSize 189# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier 190# define PyUnicode_Join PyUnicodeUCS2_Join 191# define PyUnicode_Partition PyUnicodeUCS2_Partition 192# define PyUnicode_RPartition PyUnicodeUCS2_RPartition 193# define PyUnicode_RSplit PyUnicodeUCS2_RSplit 194# define PyUnicode_Replace PyUnicodeUCS2_Replace 195# define PyUnicode_Resize PyUnicodeUCS2_Resize 196# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare 197# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding 198# define PyUnicode_Split PyUnicodeUCS2_Split 199# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines 200# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch 201# define PyUnicode_Translate PyUnicodeUCS2_Translate 202# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap 203# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString 204# define _PyUnicode_Fini _PyUnicodeUCS2_Fini 205# define _PyUnicode_Init _PyUnicodeUCS2_Init 206# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha 207# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit 208# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit 209# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak 210# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase 211# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric 212# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase 213# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart 214# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue 215# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase 216# define 
_PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace 217# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit 218# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit 219# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase 220# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric 221# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase 222# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase 223 224#else 225 226# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString 227# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString 228# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject 229# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString 230# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String 231# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString 232# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String 233# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String 234# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String 235# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode 236# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString 237# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar 238# define PyUnicode_Compare PyUnicodeUCS4_Compare 239# define PyUnicode_Concat PyUnicodeUCS4_Concat 240# define PyUnicode_Append PyUnicodeUCS4_Append 241# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel 242# define PyUnicode_Contains PyUnicodeUCS4_Contains 243# define PyUnicode_Count PyUnicodeUCS4_Count 244# define PyUnicode_Decode PyUnicodeUCS4_Decode 245# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII 246# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap 247# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 248# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape 249# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 250# define PyUnicode_DecodeUTF32Stateful 
PyUnicodeUCS4_DecodeUTF32Stateful 251# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 252# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful 253# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 254# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful 255# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape 256# define PyUnicode_Encode PyUnicodeUCS4_Encode 257# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII 258# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap 259# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal 260# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 261# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape 262# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 263# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 264# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 265# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape 266# define PyUnicode_Find PyUnicodeUCS4_Find 267# define PyUnicode_Format PyUnicodeUCS4_Format 268# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject 269# define PyUnicode_FromObject PyUnicodeUCS4_FromObject 270# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal 271# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode 272# define PyUnicode_FromString PyUnicodeUCS4_FromString 273# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize 274# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV 275# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat 276# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar 277# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding 278# define PyUnicode_GetMax PyUnicodeUCS4_GetMax 279# define PyUnicode_GetSize PyUnicodeUCS4_GetSize 280# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier 281# define PyUnicode_Join PyUnicodeUCS4_Join 282# define PyUnicode_Partition 
PyUnicodeUCS4_Partition 283# define PyUnicode_RPartition PyUnicodeUCS4_RPartition 284# define PyUnicode_RSplit PyUnicodeUCS4_RSplit 285# define PyUnicode_Replace PyUnicodeUCS4_Replace 286# define PyUnicode_Resize PyUnicodeUCS4_Resize 287# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare 288# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding 289# define PyUnicode_Split PyUnicodeUCS4_Split 290# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines 291# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch 292# define PyUnicode_Translate PyUnicodeUCS4_Translate 293# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap 294# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString 295# define _PyUnicode_Fini _PyUnicodeUCS4_Fini 296# define _PyUnicode_Init _PyUnicodeUCS4_Init 297# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha 298# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit 299# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit 300# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak 301# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase 302# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric 303# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase 304# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart 305# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue 306# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase 307# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace 308# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit 309# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit 310# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase 311# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric 312# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase 313# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase 314 315 316#endif 317 318/* --- Internal Unicode Operations ---------------------------------------- */ 319 320/* If you want 
Python to use the compiler's wctype.h functions instead 321 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or 322 configure Python using --with-wctype-functions. This reduces the 323 interpreter's code size. */ 324 325#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) 326 327#include <wctype.h> 328 329#define Py_UNICODE_ISSPACE(ch) iswspace(ch) 330 331#define Py_UNICODE_ISLOWER(ch) iswlower(ch) 332#define Py_UNICODE_ISUPPER(ch) iswupper(ch) 333#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 334#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 335 336#define Py_UNICODE_TOLOWER(ch) towlower(ch) 337#define Py_UNICODE_TOUPPER(ch) towupper(ch) 338#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 339 340#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 341#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 342#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 343 344#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 345#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 346#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 347 348#define Py_UNICODE_ISALPHA(ch) iswalpha(ch) 349 350#else 351 352#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch) 353 354#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 355#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 356#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 357#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 358 359#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 360#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 361#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 362 363#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 364#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 365#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 366 367#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 368#define Py_UNICODE_TODIGIT(ch) 
_PyUnicode_ToDigit(ch) 369#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 370 371#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 372 373#endif 374 375#define Py_UNICODE_ISALNUM(ch) \ 376 (Py_UNICODE_ISALPHA(ch) || \ 377 Py_UNICODE_ISDECIMAL(ch) || \ 378 Py_UNICODE_ISDIGIT(ch) || \ 379 Py_UNICODE_ISNUMERIC(ch)) 380 381#define Py_UNICODE_COPY(target, source, length) \ 382 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 383 384#define Py_UNICODE_FILL(target, value, length) do\ 385 {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 386 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 387 } while (0) 388 389/* check if substring matches at given offset. the offset must be 390 valid, and the substring must not be empty */ 391#define Py_UNICODE_MATCH(string, offset, substring) \ 392 ((*((string)->str + (offset)) == *((substring)->str)) && \ 393 ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ 394 !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) 395 396#ifdef __cplusplus 397extern "C" { 398#endif 399 400/* --- Unicode Type ------------------------------------------------------- */ 401 402typedef struct { 403 PyObject_HEAD 404 Py_ssize_t length; /* Length of raw Unicode data in buffer */ 405 Py_UNICODE *str; /* Raw Unicode buffer */ 406 long hash; /* Hash value; -1 if not set */ 407 int state; /* != 0 if interned. In this case the two 408 * references from the dictionary to this object 409 * are *not* counted in ob_refcnt. 
*/ 410 PyObject *defenc; /* (Default) Encoded version as Python 411 string, or NULL; this is used for 412 implementing the buffer protocol */ 413} PyUnicodeObject; 414 415PyAPI_DATA(PyTypeObject) PyUnicode_Type; 416 417#define SSTATE_NOT_INTERNED 0 418#define SSTATE_INTERNED_MORTAL 1 419#define SSTATE_INTERNED_IMMORTAL 2 420 421#define PyUnicode_Check(op) \ 422 PyType_FastSubclass(Py_Type(op), Py_TPFLAGS_UNICODE_SUBCLASS) 423#define PyUnicode_CheckExact(op) (Py_Type(op) == &PyUnicode_Type) 424 425/* Fast access macros */ 426#define PyUnicode_GET_SIZE(op) \ 427 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length)) 428#define PyUnicode_GET_DATA_SIZE(op) \ 429 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))) 430#define PyUnicode_AS_UNICODE(op) \ 431 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str)) 432#define PyUnicode_AS_DATA(op) \ 433 (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str)) 434 435/* --- Constants ---------------------------------------------------------- */ 436 437/* This Unicode character will be used as replacement character during 438 decoding if the errors argument is set to "replace". Note: the 439 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 440 Unicode 3.0. */ 441 442#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 443 444/* === Public API ========================================================= */ 445 446/* --- Plain Py_UNICODE --------------------------------------------------- */ 447 448/* Create a Unicode Object from the Py_UNICODE buffer u of the given 449 size. 450 451 u may be NULL which causes the contents to be undefined. It is the 452 user's responsibility to fill in the needed data afterwards. Note 453 that modifying the Unicode object contents after construction is 454 only allowed if u was set to NULL. 455 456 The buffer is copied into the new object. 
*/ 457 458PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 459 const Py_UNICODE *u, /* Unicode buffer */ 460 Py_ssize_t size /* size of buffer */ 461 ); 462 463/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */ 464PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 465 const char *u, /* char buffer */ 466 Py_ssize_t size /* size of buffer */ 467 ); 468 469/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 470 Latin-1 encoded bytes */ 471PyAPI_FUNC(PyObject*) PyUnicode_FromString( 472 const char *u /* string */ 473 ); 474 475/* Return a read-only pointer to the Unicode object's internal 476 Py_UNICODE buffer. */ 477 478PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 479 PyObject *unicode /* Unicode object */ 480 ); 481 482/* Get the length of the Unicode object. */ 483 484PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 485 PyObject *unicode /* Unicode object */ 486 ); 487 488/* Get the maximum ordinal for a Unicode character. */ 489PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 490 491/* Resize an already allocated Unicode object to the new size length. 492 493 *unicode is modified to point to the new (resized) object and 0 494 returned on success. 495 496 This API may only be called by the function which also called the 497 Unicode constructor. The refcount on the object must be 1. Otherwise, 498 an error is returned. 499 500 Error handling is implemented as follows: an exception is set, -1 501 is returned and *unicode left untouched. 502 503*/ 504 505PyAPI_FUNC(int) PyUnicode_Resize( 506 PyObject **unicode, /* Pointer to the Unicode object */ 507 Py_ssize_t length /* New length */ 508 ); 509 510/* Coerce obj to an Unicode object and return a reference with 511 *incremented* refcount. 512 513 Coercion is done in the following way: 514 515 1. String and other char buffer compatible objects are decoded 516 under the assumptions that they contain data using the current 517 default encoding. Decoding is done in "strict" mode. 
518 519 2. All other objects (including Unicode objects) raise an 520 exception. 521 522 The API returns NULL in case of an error. The caller is responsible 523 for decref'ing the returned objects. 524 525*/ 526 527PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 528 register PyObject *obj, /* Object */ 529 const char *encoding, /* encoding */ 530 const char *errors /* error handling */ 531 ); 532 533/* Coerce obj to an Unicode object and return a reference with 534 *incremented* refcount. 535 536 Unicode objects are passed back as-is (subclasses are converted to 537 true Unicode objects), all other objects are delegated to 538 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 539 using the default encoding as basis for decoding the object. 540 541 The API returns NULL in case of an error. The caller is responsible 542 for decref'ing the returned objects. 543 544*/ 545 546PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 547 register PyObject *obj /* Object */ 548 ); 549 550PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list); 551PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...); 552 553PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 554PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 555PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *); 556PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 557 558/* Use only if you know it's a string */ 559#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state) 560 561/* --- wchar_t support for platforms which support it --------------------- */ 562 563#ifdef HAVE_WCHAR_H 564 565/* Create a Unicode Object from the whcar_t buffer w of the given 566 size. 567 568 The buffer is copied into the new object. */ 569 570PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 571 register const wchar_t *w, /* wchar_t buffer */ 572 Py_ssize_t size /* size of buffer */ 573 ); 574 575/* Copies the Unicode Object contents into the wchar_t buffer w. 
At 576 most size wchar_t characters are copied. 577 578 Note that the resulting wchar_t string may or may not be 579 0-terminated. It is the responsibility of the caller to make sure 580 that the wchar_t string is 0-terminated in case this is required by 581 the application. 582 583 Returns the number of wchar_t characters copied (excluding a 584 possibly trailing 0-termination character) or -1 in case of an 585 error. */ 586 587PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 588 PyUnicodeObject *unicode, /* Unicode object */ 589 register wchar_t *w, /* wchar_t buffer */ 590 Py_ssize_t size /* size of buffer */ 591 ); 592 593#endif 594 595/* --- Unicode ordinals --------------------------------------------------- */ 596 597/* Create a Unicode Object from the given Unicode code point ordinal. 598 599 The ordinal must be in range(0x10000) on narrow Python builds 600 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 601 raised in case it is not. 602 603*/ 604 605PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 606 607/* === Builtin Codecs ===================================================== 608 609 Many of these APIs take two arguments encoding and errors. These 610 parameters encoding and errors have the same semantics as the ones 611 of the builtin unicode() API. 612 613 Setting encoding to NULL causes the default encoding to be used. 614 615 Error handling is set by errors which may also be set to NULL 616 meaning to use the default handling defined for the codec. Default 617 error handling for all builtin codecs is "strict" (ValueErrors are 618 raised). 619 620 The codecs all use a similar interface. Only deviation from the 621 generic ones are documented. 622 623*/ 624 625/* --- Manage the default encoding ---------------------------------------- */ 626 627/* Return a Python string holding the default encoded value of the 628 Unicode object. 
629 630 The resulting string is cached in the Unicode object for subsequent 631 usage by this function. The cached version is needed to implement 632 the character buffer interface and will live (at least) as long as 633 the Unicode object itself. 634 635 The refcount of the string is *not* incremented. 636 637 *** Exported for internal use by the interpreter only !!! *** 638 639*/ 640 641PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString( 642 PyObject *, const char *); 643 644/* Return a char* holding the default encoded value of the 645 Unicode object. 646*/ 647 648PyAPI_FUNC(char *) PyUnicode_AsString(PyObject*); 649 650 651/* Returns the currently active default encoding. 652 653 The default encoding is currently implemented as run-time settable 654 process global. This may change in future versions of the 655 interpreter to become a parameter which is managed on a per-thread 656 basis. 657 658 */ 659 660PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 661 662/* Sets the currently active default encoding. 663 664 Returns 0 on success, -1 in case of an error. 665 666 */ 667 668PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding( 669 const char *encoding /* Encoding name in standard form */ 670 ); 671 672/* --- Generic Codecs ----------------------------------------------------- */ 673 674/* Create a Unicode object by decoding the encoded string s of the 675 given size. */ 676 677PyAPI_FUNC(PyObject*) PyUnicode_Decode( 678 const char *s, /* encoded string */ 679 Py_ssize_t size, /* size of buffer */ 680 const char *encoding, /* encoding */ 681 const char *errors /* error handling */ 682 ); 683 684/* Encodes a Py_UNICODE buffer of the given size and returns a 685 Python string object. 
*/ 686 687PyAPI_FUNC(PyObject*) PyUnicode_Encode( 688 const Py_UNICODE *s, /* Unicode char buffer */ 689 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 690 const char *encoding, /* encoding */ 691 const char *errors /* error handling */ 692 ); 693 694/* Encodes a Unicode object and returns the result as Python 695 object. */ 696 697PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 698 PyObject *unicode, /* Unicode object */ 699 const char *encoding, /* encoding */ 700 const char *errors /* error handling */ 701 ); 702 703/* Encodes a Unicode object and returns the result as Python string 704 object. */ 705 706PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 707 PyObject *unicode, /* Unicode object */ 708 const char *encoding, /* encoding */ 709 const char *errors /* error handling */ 710 ); 711 712PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 713 PyObject* string /* 256 character map */ 714 ); 715 716 717/* --- UTF-7 Codecs ------------------------------------------------------- */ 718 719PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 720 const char *string, /* UTF-7 encoded string */ 721 Py_ssize_t length, /* size of string */ 722 const char *errors /* error handling */ 723 ); 724 725PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 726 const Py_UNICODE *data, /* Unicode char buffer */ 727 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 728 int encodeSetO, /* force the encoder to encode characters in 729 Set O, as described in RFC2152 */ 730 int encodeWhiteSpace, /* force the encoder to encode space, tab, 731 carriage return and linefeed characters */ 732 const char *errors /* error handling */ 733 ); 734 735/* --- UTF-8 Codecs ------------------------------------------------------- */ 736 737PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 738 const char *string, /* UTF-8 encoded string */ 739 Py_ssize_t length, /* size of string */ 740 const char *errors /* error handling */ 741 ); 742 743PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 744 
const char *string, /* UTF-8 encoded string */ 745 Py_ssize_t length, /* size of string */ 746 const char *errors, /* error handling */ 747 Py_ssize_t *consumed /* bytes consumed */ 748 ); 749 750PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 751 PyObject *unicode /* Unicode object */ 752 ); 753 754PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 755 const Py_UNICODE *data, /* Unicode char buffer */ 756 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 757 const char *errors /* error handling */ 758 ); 759 760/* --- UTF-32 Codecs ------------------------------------------------------ */ 761 762/* Decodes length bytes from a UTF-32 encoded buffer string and returns 763 the corresponding Unicode object. 764 765 errors (if non-NULL) defines the error handling. It defaults 766 to "strict". 767 768 If byteorder is non-NULL, the decoder starts decoding using the 769 given byte order: 770 771 *byteorder == -1: little endian 772 *byteorder == 0: native order 773 *byteorder == 1: big endian 774 775 In native mode, the first four bytes of the stream are checked for a 776 BOM mark. If found, the BOM mark is analysed, the byte order 777 adjusted and the BOM skipped. In the other modes, no BOM mark 778 interpretation is done. After completion, *byteorder is set to the 779 current byte order at the end of input data. 780 781 If byteorder is NULL, the codec starts in native order mode. 
782 783*/ 784 785PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 786 const char *string, /* UTF-32 encoded string */ 787 Py_ssize_t length, /* size of string */ 788 const char *errors, /* error handling */ 789 int *byteorder /* pointer to byteorder to use 790 0=native;-1=LE,1=BE; updated on 791 exit */ 792 ); 793 794PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 795 const char *string, /* UTF-32 encoded string */ 796 Py_ssize_t length, /* size of string */ 797 const char *errors, /* error handling */ 798 int *byteorder, /* pointer to byteorder to use 799 0=native;-1=LE,1=BE; updated on 800 exit */ 801 Py_ssize_t *consumed /* bytes consumed */ 802 ); 803 804/* Returns a Python string using the UTF-32 encoding in native byte 805 order. The string always starts with a BOM mark. */ 806 807PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 808 PyObject *unicode /* Unicode object */ 809 ); 810 811/* Returns a Python string object holding the UTF-32 encoded value of 812 the Unicode data. 813 814 If byteorder is not 0, output is written according to the following 815 byte order: 816 817 byteorder == -1: little endian 818 byteorder == 0: native byte order (writes a BOM mark) 819 byteorder == 1: big endian 820 821 If byteorder is 0, the output string will always start with the 822 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 823 prepended. 824 825*/ 826 827PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 828 const Py_UNICODE *data, /* Unicode char buffer */ 829 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 830 const char *errors, /* error handling */ 831 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 832 ); 833 834/* --- UTF-16 Codecs ------------------------------------------------------ */ 835 836/* Decodes length bytes from a UTF-16 encoded buffer string and returns 837 the corresponding Unicode object. 838 839 errors (if non-NULL) defines the error handling. It defaults 840 to "strict". 
841 842 If byteorder is non-NULL, the decoder starts decoding using the 843 given byte order: 844 845 *byteorder == -1: little endian 846 *byteorder == 0: native order 847 *byteorder == 1: big endian 848 849 In native mode, the first two bytes of the stream are checked for a 850 BOM mark. If found, the BOM mark is analysed, the byte order 851 adjusted and the BOM skipped. In the other modes, no BOM mark 852 interpretation is done. After completion, *byteorder is set to the 853 current byte order at the end of input data. 854 855 If byteorder is NULL, the codec starts in native order mode. 856 857*/ 858 859PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 860 const char *string, /* UTF-16 encoded string */ 861 Py_ssize_t length, /* size of string */ 862 const char *errors, /* error handling */ 863 int *byteorder /* pointer to byteorder to use 864 0=native;-1=LE,1=BE; updated on 865 exit */ 866 ); 867 868PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 869 const char *string, /* UTF-16 encoded string */ 870 Py_ssize_t length, /* size of string */ 871 const char *errors, /* error handling */ 872 int *byteorder, /* pointer to byteorder to use 873 0=native;-1=LE,1=BE; updated on 874 exit */ 875 Py_ssize_t *consumed /* bytes consumed */ 876 ); 877 878/* Returns a Python string using the UTF-16 encoding in native byte 879 order. The string always starts with a BOM mark. */ 880 881PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 882 PyObject *unicode /* Unicode object */ 883 ); 884 885/* Returns a Python string object holding the UTF-16 encoded value of 886 the Unicode data. 887 888 If byteorder is not 0, output is written according to the following 889 byte order: 890 891 byteorder == -1: little endian 892 byteorder == 0: native byte order (writes a BOM mark) 893 byteorder == 1: big endian 894 895 If byteorder is 0, the output string will always start with the 896 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 897 prepended. 
898 899 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 900 UCS-2. This trick makes it possible to add full UTF-16 capabilities 901 at a later point without compromising the APIs. 902 903*/ 904 905PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 906 const Py_UNICODE *data, /* Unicode char buffer */ 907 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 908 const char *errors, /* error handling */ 909 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 910 ); 911 912/* --- Unicode-Escape Codecs ---------------------------------------------- */ 913 914PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 915 const char *string, /* Unicode-Escape encoded string */ 916 Py_ssize_t length, /* size of string */ 917 const char *errors /* error handling */ 918 ); 919 920PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 921 PyObject *unicode /* Unicode object */ 922 ); 923 924PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 925 const Py_UNICODE *data, /* Unicode char buffer */ 926 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 927 ); 928 929/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 930 931PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 932 const char *string, /* Raw-Unicode-Escape encoded string */ 933 Py_ssize_t length, /* size of string */ 934 const char *errors /* error handling */ 935 ); 936 937PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 938 PyObject *unicode /* Unicode object */ 939 ); 940 941PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 942 const Py_UNICODE *data, /* Unicode char buffer */ 943 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 944 ); 945 946/* --- Unicode Internal Codec --------------------------------------------- 947 948 Only for internal use in _codecsmodule.c */ 949 950PyObject *_PyUnicode_DecodeUnicodeInternal( 951 const char *string, 952 Py_ssize_t length, 953 const char *errors 954 ); 955 956/* --- Latin-1 
Codecs ----------------------------------------------------- 957 958 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 959 960*/ 961 962PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 963 const char *string, /* Latin-1 encoded string */ 964 Py_ssize_t length, /* size of string */ 965 const char *errors /* error handling */ 966 ); 967 968PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 969 PyObject *unicode /* Unicode object */ 970 ); 971 972PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 973 const Py_UNICODE *data, /* Unicode char buffer */ 974 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 975 const char *errors /* error handling */ 976 ); 977 978/* --- ASCII Codecs ------------------------------------------------------- 979 980 Only 7-bit ASCII data is accepted. All other codes generate errors. 981 982*/ 983 984PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 985 const char *string, /* ASCII encoded string */ 986 Py_ssize_t length, /* size of string */ 987 const char *errors /* error handling */ 988 ); 989 990PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 991 PyObject *unicode /* Unicode object */ 992 ); 993 994PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 995 const Py_UNICODE *data, /* Unicode char buffer */ 996 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 997 const char *errors /* error handling */ 998 ); 999 1000/* --- Character Map Codecs ----------------------------------------------- 1001 1002 This codec uses mappings to encode and decode characters. 1003 1004 Decoding mappings must map single string characters to single 1005 Unicode characters, integers (which are then interpreted as Unicode 1006 ordinals) or None (meaning "undefined mapping" and causing an 1007 error). 1008 1009 Encoding mappings must map single Unicode characters to single 1010 string characters, integers (which are then interpreted as Latin-1 1011 ordinals) or None (meaning "undefined mapping" and causing an 1012 error). 
1013 1014 If a character lookup fails with a LookupError, the character is 1015 copied as-is meaning that its ordinal value will be interpreted as 1016 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1017 to contain those mappings which map characters to different code 1018 points. 1019 1020*/ 1021 1022PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1023 const char *string, /* Encoded string */ 1024 Py_ssize_t length, /* size of string */ 1025 PyObject *mapping, /* character mapping 1026 (char ordinal -> unicode ordinal) */ 1027 const char *errors /* error handling */ 1028 ); 1029 1030PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1031 PyObject *unicode, /* Unicode object */ 1032 PyObject *mapping /* character mapping 1033 (unicode ordinal -> char ordinal) */ 1034 ); 1035 1036PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1037 const Py_UNICODE *data, /* Unicode char buffer */ 1038 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1039 PyObject *mapping, /* character mapping 1040 (unicode ordinal -> char ordinal) */ 1041 const char *errors /* error handling */ 1042 ); 1043 1044/* Translate a Py_UNICODE buffer of the given length by applying a 1045 character mapping table to it and return the resulting Unicode 1046 object. 1047 1048 The mapping table must map Unicode ordinal integers to Unicode 1049 ordinal integers or None (causing deletion of the character). 1050 1051 Mapping tables may be dictionaries or sequences. Unmapped character 1052 ordinals (ones which cause a LookupError) are left untouched and 1053 are copied as-is. 
1054 1055*/ 1056 1057PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1058 const Py_UNICODE *data, /* Unicode char buffer */ 1059 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */ 1060 PyObject *table, /* Translate table */ 1061 const char *errors /* error handling */ 1062 ); 1063 1064#ifdef MS_WIN32 1065 1066/* --- MBCS codecs for Windows -------------------------------------------- */ 1067 1068PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1069 const char *string, /* MBCS encoded string */ 1070 Py_ssize_t length, /* size of string */ 1071 const char *errors /* error handling */ 1072 ); 1073 1074PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1075 const char *string, /* MBCS encoded string */ 1076 Py_ssize_t length, /* size of string */ 1077 const char *errors, /* error handling */ 1078 Py_ssize_t *consumed /* bytes consumed */ 1079 ); 1080 1081PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1082 PyObject *unicode /* Unicode object */ 1083 ); 1084 1085PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1086 const Py_UNICODE *data, /* Unicode char buffer */ 1087 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1088 const char *errors /* error handling */ 1089 ); 1090 1091#endif /* MS_WIN32 */ 1092 1093/* --- Decimal Encoder ---------------------------------------------------- */ 1094 1095/* Takes a Unicode string holding a decimal value and writes it into 1096 an output buffer using standard ASCII digit codes. 1097 1098 The output buffer has to provide at least length+1 bytes of storage 1099 area. The output string is 0-terminated. 1100 1101 The encoder converts whitespace to ' ', decimal characters to their 1102 corresponding ASCII digit and all other Latin-1 characters except 1103 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 1104 are treated as errors. This includes embedded NULL bytes. 
1105 1106 Error handling is defined by the errors argument: 1107 1108 NULL or "strict": raise a ValueError 1109 "ignore": ignore the wrong characters (these are not copied to the 1110 output buffer) 1111 "replace": replaces illegal characters with '?' 1112 1113 Returns 0 on success, -1 on failure. 1114 1115*/ 1116 1117PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1118 Py_UNICODE *s, /* Unicode buffer */ 1119 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1120 char *output, /* Output buffer; must have size >= length+1 (room for the terminating 0 byte) */ 1121 const char *errors /* error handling */ 1122 ); 1123 1124/* --- Methods & Slots ---------------------------------------------------- 1125 1126 These are capable of handling Unicode objects and strings on input 1127 (we refer to them as strings in the descriptions) and return 1128 Unicode objects or integers as appropriate. */ 1129 1130/* Concat two strings giving a new Unicode string. */ 1131 1132PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1133 PyObject *left, /* Left string */ 1134 PyObject *right /* Right string */ 1135 ); 1136 1137/* Concat two strings and put the result in *pleft 1138 (sets *pleft to NULL on error) */ 1139 1140PyAPI_FUNC(void) PyUnicode_Append( 1141 PyObject **pleft, /* Pointer to left string */ 1142 PyObject *right /* Right string */ 1143 ); 1144 1145/* Concat two strings, put the result in *pleft and drop the right object 1146 (sets *pleft to NULL on error) */ 1147 1148PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1149 PyObject **pleft, /* Pointer to left string */ 1150 PyObject *right /* Right string */ 1151 ); 1152 1153/* Split a string giving a list of Unicode strings. 1154 1155 If sep is NULL, splitting will be done at all whitespace 1156 substrings. Otherwise, splits occur at the given separator. 1157 1158 At most maxsplit splits will be done. If negative, no limit is set. 1159 1160 Separators are not included in the resulting list. 
1161 1162*/ 1163 1164PyAPI_FUNC(PyObject*) PyUnicode_Split( 1165 PyObject *s, /* String to split */ 1166 PyObject *sep, /* String separator */ 1167 Py_ssize_t maxsplit /* Maxsplit count */ 1168 ); 1169 1170/* Split a string at line breaks, giving a list of Unicode strings. 1171 1172 CRLF is considered to be one line break. Line breaks are not 1173 included in the resulting list unless keepends is true. */ 1174 1175PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1176 PyObject *s, /* String to split */ 1177 int keepends /* If true, line end markers are included */ 1178 ); 1179 1180/* Partition a string using a given separator. */ 1181 1182PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1183 PyObject *s, /* String to partition */ 1184 PyObject *sep /* String separator */ 1185 ); 1186 1187/* Partition a string using a given separator, searching from the end of the 1188 string. */ 1189 1190PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1191 PyObject *s, /* String to partition */ 1192 PyObject *sep /* String separator */ 1193 ); 1194 1195/* Split a string giving a list of Unicode strings. 1196 1197 If sep is NULL, splitting will be done at all whitespace 1198 substrings. Otherwise, splits occur at the given separator. 1199 1200 At most maxsplit splits will be done; if maxsplit is negative, 1201 no limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits 1202 from the end of the string. 1203 1204 Separators are not included in the resulting list. 1205 1206*/ 1207 1208PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1209 PyObject *s, /* String to split */ 1210 PyObject *sep, /* String separator */ 1211 Py_ssize_t maxsplit /* Maxsplit count */ 1212 ); 1213 1214/* Translate a string by applying a character mapping table to it and 1215 return the resulting Unicode object. 1216 1217 The mapping table must map Unicode ordinal integers to Unicode 1218 ordinal integers or None (causing deletion of the character). 1219 1220 Mapping tables may be dictionaries or sequences. 
Unmapped character 1221 ordinals (ones which cause a LookupError) are left untouched and 1222 are copied as-is. 1223 1224*/ 1225 1226PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1227 PyObject *str, /* String */ 1228 PyObject *table, /* Translate table */ 1229 const char *errors /* error handling */ 1230 ); 1231 1232/* Join a sequence of strings using the given separator and return 1233 the resulting Unicode string. */ 1234 1235PyAPI_FUNC(PyObject*) PyUnicode_Join( 1236 PyObject *separator, /* Separator string */ 1237 PyObject *seq /* Sequence object */ 1238 ); 1239 1240/* Return 1 if substr matches str[start:end] at the given tail end, 0 1241 otherwise. */ 1242 1243PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1244 PyObject *str, /* String */ 1245 PyObject *substr, /* Prefix or Suffix string */ 1246 Py_ssize_t start, /* Start index */ 1247 Py_ssize_t end, /* Stop index */ 1248 int direction /* Tail end: -1 prefix, +1 suffix */ 1249 ); 1250 1251/* Return the first position of substr in str[start:end] using the 1252 given search direction or -1 if not found. -2 is returned in case 1253 an error occurred and an exception is set. */ 1254 1255PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1256 PyObject *str, /* String */ 1257 PyObject *substr, /* Substring to find */ 1258 Py_ssize_t start, /* Start index */ 1259 Py_ssize_t end, /* Stop index */ 1260 int direction /* Find direction: +1 forward, -1 backward */ 1261 ); 1262 1263/* Count the number of occurrences of substr in str[start:end]. */ 1264 1265PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1266 PyObject *str, /* String */ 1267 PyObject *substr, /* Substring to count */ 1268 Py_ssize_t start, /* Start index */ 1269 Py_ssize_t end /* Stop index */ 1270 ); 1271 1272/* Replace at most maxcount occurrences of substr in str with replstr 1273 and return the resulting Unicode object. 
*/ 1274 1275PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1276 PyObject *str, /* String */ 1277 PyObject *substr, /* Substring to find */ 1278 PyObject *replstr, /* Replacement substring */ 1279 Py_ssize_t maxcount /* Max. number of replacements to apply; 1280 -1 = all */ 1281 ); 1282 1283/* Compare two strings and return -1, 0, 1 for less than, equal, 1284 greater than resp. */ 1285 1286PyAPI_FUNC(int) PyUnicode_Compare( 1287 PyObject *left, /* Left string */ 1288 PyObject *right /* Right string */ 1289 ); 1290 /* NOTE(review): no description is given here. Judging by the signature this compares a Unicode object with a C string; presumably the result follows the PyUnicode_Compare convention (-1, 0, 1) -- confirm against the implementation. */ 1291PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1292 PyObject *left, 1293 const char *right 1294 ); 1295 1296/* Rich compare two strings and return one of the following: 1297 1298 - NULL in case an exception was raised 1299 - Py_True or Py_False for successful comparisons 1300 - Py_NotImplemented in case the type combination is unknown 1301 1302 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1303 case the conversion of the arguments to Unicode fails with a 1304 UnicodeDecodeError. 1305 1306 Possible values for op: 1307 1308 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1309 1310*/ 1311 1312PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1313 PyObject *left, /* Left string */ 1314 PyObject *right, /* Right string */ 1315 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1316 ); 1317 1318/* Apply an argument tuple or dictionary to a format string and return 1319 the resulting Unicode string. */ 1320 1321PyAPI_FUNC(PyObject *) PyUnicode_Format( 1322 PyObject *format, /* Format string */ 1323 PyObject *args /* Argument tuple or dictionary */ 1324 ); 1325 1326/* Checks whether element is contained in container and return 1/0 1327 accordingly. 1328 1329 element has to coerce to an one element Unicode string. -1 is 1330 returned in case of an error. 
*/ 1331 1332PyAPI_FUNC(int) PyUnicode_Contains( 1333 PyObject *container, /* Container string */ 1334 PyObject *element /* Element string */ 1335 ); 1336 1337/* Checks whether argument is a valid identifier. */ 1338 1339PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1340 1341/* Externally visible for str.strip(unicode) */ 1342PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 1343 PyUnicodeObject *self, 1344 int striptype, 1345 PyObject *sepobj 1346 ); 1347 1348/* === Characters Type APIs =============================================== */ 1349 1350/* These should not be used directly. Use the Py_UNICODE_IS* and 1351 Py_UNICODE_TO* macros instead. 1352 1353 These APIs are implemented in Objects/unicodectype.c. 1354 1355*/ 1356 1357PyAPI_FUNC(int) _PyUnicode_IsLowercase( 1358 Py_UNICODE ch /* Unicode character */ 1359 ); 1360 1361PyAPI_FUNC(int) _PyUnicode_IsUppercase( 1362 Py_UNICODE ch /* Unicode character */ 1363 ); 1364 1365PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 1366 Py_UNICODE ch /* Unicode character */ 1367 ); 1368 1369PyAPI_FUNC(int) _PyUnicode_IsXidStart( 1370 Py_UNICODE ch /* Unicode character */ 1371 ); 1372 1373PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 1374 Py_UNICODE ch /* Unicode character */ 1375 ); 1376 1377PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 1378 const Py_UNICODE ch /* Unicode character */ 1379 ); 1380 1381PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 1382 const Py_UNICODE ch /* Unicode character */ 1383 ); 1384 1385PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase( 1386 Py_UNICODE ch /* Unicode character */ 1387 ); 1388 1389PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase( 1390 Py_UNICODE ch /* Unicode character */ 1391 ); 1392 1393PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase( 1394 Py_UNICODE ch /* Unicode character */ 1395 ); 1396 1397PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 1398 Py_UNICODE ch /* Unicode character */ 1399 ); 1400 1401PyAPI_FUNC(int) _PyUnicode_ToDigit( 1402 Py_UNICODE ch /* Unicode character */ 1403 ); 1404 1405PyAPI_FUNC(double) 
_PyUnicode_ToNumeric( 1406 Py_UNICODE ch /* Unicode character */ 1407 ); 1408 1409PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 1410 Py_UNICODE ch /* Unicode character */ 1411 ); 1412 1413PyAPI_FUNC(int) _PyUnicode_IsDigit( 1414 Py_UNICODE ch /* Unicode character */ 1415 ); 1416 1417PyAPI_FUNC(int) _PyUnicode_IsNumeric( 1418 Py_UNICODE ch /* Unicode character */ 1419 ); 1420 1421PyAPI_FUNC(int) _PyUnicode_IsAlpha( 1422 Py_UNICODE ch /* Unicode character */ 1423 ); 1424 1425PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u); 1426 1427PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 1428 Py_UNICODE *s1, const Py_UNICODE *s2); 1429 1430PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 1431 Py_UNICODE *s1, const Py_UNICODE *s2, size_t n); 1432 1433PyAPI_FUNC(int) Py_UNICODE_strcmp( 1434 const Py_UNICODE *s1, const Py_UNICODE *s2); 1435 1436PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 1437 const Py_UNICODE *s, Py_UNICODE c 1438 ); 1439 1440PyObject * 1441_unicodeformatter_iterator(PyObject *str); 1442PyObject * 1443_unicodeformatter_field_name_split(PyObject *field_name); 1444 1445#ifdef __cplusplus 1446} 1447#endif 1448#endif /* !Py_UNICODEOBJECT_H */ 1449