unicodeobject.h revision 00bc0e0a2d0b6c403a3c6ab96fa7d3398b5c751e
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4/* 5 6Unicode implementation based on original code by Fredrik Lundh, 7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 8Unicode Integration Proposal (see file Misc/unicode.txt). 9 10Copyright (c) Corporation for National Research Initiatives. 11 12 13 Original header: 14 -------------------------------------------------------------------- 15 16 * Yet another Unicode string type for Python. This type supports the 17 * 16-bit Basic Multilingual Plane (BMP) only. 18 * 19 * Written by Fredrik Lundh, January 1999. 20 * 21 * Copyright (c) 1999 by Secret Labs AB. 22 * Copyright (c) 1999 by Fredrik Lundh. 23 * 24 * fredrik@pythonware.com 25 * http://www.pythonware.com 26 * 27 * -------------------------------------------------------------------- 28 * This Unicode String Type is 29 * 30 * Copyright (c) 1999 by Secret Labs AB 31 * Copyright (c) 1999 by Fredrik Lundh 32 * 33 * By obtaining, using, and/or copying this software and/or its 34 * associated documentation, you agree that you have read, understood, 35 * and will comply with the following terms and conditions: 36 * 37 * Permission to use, copy, modify, and distribute this software and its 38 * associated documentation for any purpose and without fee is hereby 39 * granted, provided that the above copyright notice appears in all 40 * copies, and that both that copyright notice and this permission notice 41 * appear in supporting documentation, and that the name of Secret Labs 42 * AB or the author not be used in advertising or publicity pertaining to 43 * distribution of the software without specific, written prior 44 * permission. 45 * 46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 48 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 53 * -------------------------------------------------------------------- */ 54 55#include <ctype.h> 56 57/* === Internal API ======================================================= */ 58 59/* --- Internal Unicode Format -------------------------------------------- */ 60 61/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is 62 properly set, but the default rules below doesn't set it. I'll 63 sort this out some other day -- fredrik@pythonware.com */ 64 65#ifndef Py_UNICODE_SIZE 66#error Must define Py_UNICODE_SIZE 67#endif 68 69/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode 70 strings are stored as UCS-2 (with limited support for UTF-16) */ 71 72#if Py_UNICODE_SIZE >= 4 73#define Py_UNICODE_WIDE 74#endif 75 76/* Set these flags if the platform has "wchar.h", "wctype.h" and the 77 wchar_t type is a 16-bit unsigned type */ 78/* #define HAVE_WCHAR_H */ 79/* #define HAVE_USABLE_WCHAR_T */ 80 81/* Defaults for various platforms */ 82#ifndef PY_UNICODE_TYPE 83 84/* Windows has a usable wchar_t type (unless we're using UCS-4) */ 85# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 86# define HAVE_USABLE_WCHAR_T 87# define PY_UNICODE_TYPE wchar_t 88# endif 89 90# if defined(Py_UNICODE_WIDE) 91# define PY_UNICODE_TYPE Py_UCS4 92# endif 93 94#endif 95 96/* If the compiler provides a wchar_t type we try to support it 97 through the interface functions PyUnicode_FromWideChar() and 98 PyUnicode_AsWideChar(). 
*/ 99 100#ifdef HAVE_USABLE_WCHAR_T 101# ifndef HAVE_WCHAR_H 102# define HAVE_WCHAR_H 103# endif 104#endif 105 106#ifdef HAVE_WCHAR_H 107/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 108# ifdef _HAVE_BSDI 109# include <time.h> 110# endif 111# include <wchar.h> 112#endif 113 114/* 115 * Use this typedef when you need to represent a UTF-16 surrogate pair 116 * as single unsigned integer. 117 */ 118#if SIZEOF_INT >= 4 119typedef unsigned int Py_UCS4; 120#elif SIZEOF_LONG >= 4 121typedef unsigned long Py_UCS4; 122#endif 123 124typedef PY_UNICODE_TYPE Py_UNICODE; 125 126/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ 127 128/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds 129 produce different external names and thus cause import errors in 130 case Python interpreters and extensions with mixed compiled in 131 Unicode width assumptions are combined. */ 132 133#ifndef Py_UNICODE_WIDE 134 135# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString 136# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString 137# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject 138# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString 139# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String 140# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString 141# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String 142# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String 143# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String 144# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode 145# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString 146# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar 147# define PyUnicode_Compare PyUnicodeUCS2_Compare 148# define PyUnicode_Concat PyUnicodeUCS2_Concat 149# define PyUnicode_Append PyUnicodeUCS2_Append 150# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel 
151# define PyUnicode_Contains PyUnicodeUCS2_Contains 152# define PyUnicode_Count PyUnicodeUCS2_Count 153# define PyUnicode_Decode PyUnicodeUCS2_Decode 154# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII 155# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap 156# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 157# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault 158# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape 159# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 160# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful 161# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 162# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful 163# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 164# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful 165# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape 166# define PyUnicode_Encode PyUnicodeUCS2_Encode 167# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII 168# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap 169# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal 170# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 171# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape 172# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 173# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 174# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 175# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape 176# define PyUnicode_Find PyUnicodeUCS2_Find 177# define PyUnicode_Format PyUnicodeUCS2_Format 178# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject 179# define PyUnicode_FromObject PyUnicodeUCS2_FromObject 180# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal 181# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode 182# define PyUnicode_FromString 
PyUnicodeUCS2_FromString 183# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize 184# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV 185# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat 186# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar 187# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding 188# define PyUnicode_GetMax PyUnicodeUCS2_GetMax 189# define PyUnicode_GetSize PyUnicodeUCS2_GetSize 190# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier 191# define PyUnicode_Join PyUnicodeUCS2_Join 192# define PyUnicode_Partition PyUnicodeUCS2_Partition 193# define PyUnicode_RPartition PyUnicodeUCS2_RPartition 194# define PyUnicode_RSplit PyUnicodeUCS2_RSplit 195# define PyUnicode_Replace PyUnicodeUCS2_Replace 196# define PyUnicode_Resize PyUnicodeUCS2_Resize 197# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare 198# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding 199# define PyUnicode_Split PyUnicodeUCS2_Split 200# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines 201# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch 202# define PyUnicode_Translate PyUnicodeUCS2_Translate 203# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap 204# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString 205# define _PyUnicode_Fini _PyUnicodeUCS2_Fini 206# define _PyUnicode_Init _PyUnicodeUCS2_Init 207# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha 208# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit 209# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit 210# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak 211# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase 212# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric 213# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase 214# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart 215# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue 216# define 
_PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase 217# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace 218# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit 219# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit 220# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase 221# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric 222# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase 223# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase 224 225#else 226 227# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString 228# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString 229# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject 230# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString 231# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String 232# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString 233# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String 234# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String 235# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String 236# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode 237# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString 238# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar 239# define PyUnicode_Compare PyUnicodeUCS4_Compare 240# define PyUnicode_Concat PyUnicodeUCS4_Concat 241# define PyUnicode_Append PyUnicodeUCS4_Append 242# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel 243# define PyUnicode_Contains PyUnicodeUCS4_Contains 244# define PyUnicode_Count PyUnicodeUCS4_Count 245# define PyUnicode_Decode PyUnicodeUCS4_Decode 246# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII 247# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap 248# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 249# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault 250# define PyUnicode_DecodeRawUnicodeEscape 
PyUnicodeUCS4_DecodeRawUnicodeEscape 251# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 252# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful 253# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 254# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful 255# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 256# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful 257# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape 258# define PyUnicode_Encode PyUnicodeUCS4_Encode 259# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII 260# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap 261# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal 262# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 263# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape 264# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 265# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 266# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 267# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape 268# define PyUnicode_Find PyUnicodeUCS4_Find 269# define PyUnicode_Format PyUnicodeUCS4_Format 270# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject 271# define PyUnicode_FromObject PyUnicodeUCS4_FromObject 272# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal 273# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode 274# define PyUnicode_FromString PyUnicodeUCS4_FromString 275# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize 276# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV 277# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat 278# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar 279# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding 280# define PyUnicode_GetMax PyUnicodeUCS4_GetMax 281# define PyUnicode_GetSize PyUnicodeUCS4_GetSize 282# define 
PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier 283# define PyUnicode_Join PyUnicodeUCS4_Join 284# define PyUnicode_Partition PyUnicodeUCS4_Partition 285# define PyUnicode_RPartition PyUnicodeUCS4_RPartition 286# define PyUnicode_RSplit PyUnicodeUCS4_RSplit 287# define PyUnicode_Replace PyUnicodeUCS4_Replace 288# define PyUnicode_Resize PyUnicodeUCS4_Resize 289# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare 290# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding 291# define PyUnicode_Split PyUnicodeUCS4_Split 292# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines 293# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch 294# define PyUnicode_Translate PyUnicodeUCS4_Translate 295# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap 296# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString 297# define _PyUnicode_Fini _PyUnicodeUCS4_Fini 298# define _PyUnicode_Init _PyUnicodeUCS4_Init 299# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha 300# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit 301# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit 302# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak 303# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase 304# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric 305# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase 306# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart 307# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue 308# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase 309# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace 310# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit 311# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit 312# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase 313# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric 314# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase 315# define _PyUnicode_ToUppercase 
_PyUnicodeUCS4_ToUppercase 316 317 318#endif 319 320/* --- Internal Unicode Operations ---------------------------------------- */ 321 322/* If you want Python to use the compiler's wctype.h functions instead 323 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or 324 configure Python using --with-wctype-functions. This reduces the 325 interpreter's code size. */ 326 327#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) 328 329#include <wctype.h> 330 331#define Py_UNICODE_ISSPACE(ch) iswspace(ch) 332 333#define Py_UNICODE_ISLOWER(ch) iswlower(ch) 334#define Py_UNICODE_ISUPPER(ch) iswupper(ch) 335#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 336#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 337 338#define Py_UNICODE_TOLOWER(ch) towlower(ch) 339#define Py_UNICODE_TOUPPER(ch) towupper(ch) 340#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 341 342#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 343#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 344#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 345 346#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 347#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 348#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 349 350#define Py_UNICODE_ISALPHA(ch) iswalpha(ch) 351 352#else 353 354#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch) 355 356#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 357#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 358#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 359#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 360 361#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 362#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 363#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 364 365#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 366#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 367#define 
Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 368 369#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 370#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 371#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 372 373#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 374 375#endif 376 377#define Py_UNICODE_ISALNUM(ch) \ 378 (Py_UNICODE_ISALPHA(ch) || \ 379 Py_UNICODE_ISDECIMAL(ch) || \ 380 Py_UNICODE_ISDIGIT(ch) || \ 381 Py_UNICODE_ISNUMERIC(ch)) 382 383#define Py_UNICODE_COPY(target, source, length) \ 384 Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) 385 386#define Py_UNICODE_FILL(target, value, length) do\ 387 {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 388 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 389 } while (0) 390 391/* check if substring matches at given offset. the offset must be 392 valid, and the substring must not be empty */ 393#define Py_UNICODE_MATCH(string, offset, substring) \ 394 ((*((string)->str + (offset)) == *((substring)->str)) && \ 395 ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \ 396 !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE))) 397 398#ifdef __cplusplus 399extern "C" { 400#endif 401 402/* --- Unicode Type ------------------------------------------------------- */ 403 404typedef struct { 405 PyObject_HEAD 406 Py_ssize_t length; /* Length of raw Unicode data in buffer */ 407 Py_UNICODE *str; /* Raw Unicode buffer */ 408 long hash; /* Hash value; -1 if not set */ 409 int state; /* != 0 if interned. In this case the two 410 * references from the dictionary to this object 411 * are *not* counted in ob_refcnt. 
*/ 412 PyObject *defenc; /* (Default) Encoded version as Python 413 string, or NULL; this is used for 414 implementing the buffer protocol */ 415} PyUnicodeObject; 416 417PyAPI_DATA(PyTypeObject) PyUnicode_Type; 418 419#define SSTATE_NOT_INTERNED 0 420#define SSTATE_INTERNED_MORTAL 1 421#define SSTATE_INTERNED_IMMORTAL 2 422 423#define PyUnicode_Check(op) \ 424 PyType_FastSubclass(Py_Type(op), Py_TPFLAGS_UNICODE_SUBCLASS) 425#define PyUnicode_CheckExact(op) (Py_Type(op) == &PyUnicode_Type) 426 427/* Fast access macros */ 428#define PyUnicode_GET_SIZE(op) \ 429 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length)) 430#define PyUnicode_GET_DATA_SIZE(op) \ 431 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))) 432#define PyUnicode_AS_UNICODE(op) \ 433 (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str)) 434#define PyUnicode_AS_DATA(op) \ 435 (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str)) 436 437/* --- Constants ---------------------------------------------------------- */ 438 439/* This Unicode character will be used as replacement character during 440 decoding if the errors argument is set to "replace". Note: the 441 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 442 Unicode 3.0. */ 443 444#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 445 446/* === Public API ========================================================= */ 447 448/* --- Plain Py_UNICODE --------------------------------------------------- */ 449 450/* Create a Unicode Object from the Py_UNICODE buffer u of the given 451 size. 452 453 u may be NULL which causes the contents to be undefined. It is the 454 user's responsibility to fill in the needed data afterwards. Note 455 that modifying the Unicode object contents after construction is 456 only allowed if u was set to NULL. 457 458 The buffer is copied into the new object. 
*/ 459 460PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( 461 const Py_UNICODE *u, /* Unicode buffer */ 462 Py_ssize_t size /* size of buffer */ 463 ); 464 465/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */ 466PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 467 const char *u, /* char buffer */ 468 Py_ssize_t size /* size of buffer */ 469 ); 470 471/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 472 Latin-1 encoded bytes */ 473PyAPI_FUNC(PyObject*) PyUnicode_FromString( 474 const char *u /* string */ 475 ); 476 477/* Return a read-only pointer to the Unicode object's internal 478 Py_UNICODE buffer. */ 479 480PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( 481 PyObject *unicode /* Unicode object */ 482 ); 483 484/* Get the length of the Unicode object. */ 485 486PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 487 PyObject *unicode /* Unicode object */ 488 ); 489 490/* Get the maximum ordinal for a Unicode character. */ 491PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); 492 493/* Resize an already allocated Unicode object to the new size length. 494 495 *unicode is modified to point to the new (resized) object and 0 496 returned on success. 497 498 This API may only be called by the function which also called the 499 Unicode constructor. The refcount on the object must be 1. Otherwise, 500 an error is returned. 501 502 Error handling is implemented as follows: an exception is set, -1 503 is returned and *unicode left untouched. 504 505*/ 506 507PyAPI_FUNC(int) PyUnicode_Resize( 508 PyObject **unicode, /* Pointer to the Unicode object */ 509 Py_ssize_t length /* New length */ 510 ); 511 512/* Coerce obj to an Unicode object and return a reference with 513 *incremented* refcount. 514 515 Coercion is done in the following way: 516 517 1. String and other char buffer compatible objects are decoded 518 under the assumptions that they contain data using the current 519 default encoding. Decoding is done in "strict" mode. 
520 521 2. All other objects (including Unicode objects) raise an 522 exception. 523 524 The API returns NULL in case of an error. The caller is responsible 525 for decref'ing the returned objects. 526 527*/ 528 529PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 530 register PyObject *obj, /* Object */ 531 const char *encoding, /* encoding */ 532 const char *errors /* error handling */ 533 ); 534 535/* Coerce obj to an Unicode object and return a reference with 536 *incremented* refcount. 537 538 Unicode objects are passed back as-is (subclasses are converted to 539 true Unicode objects), all other objects are delegated to 540 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 541 using the default encoding as basis for decoding the object. 542 543 The API returns NULL in case of an error. The caller is responsible 544 for decref'ing the returned objects. 545 546*/ 547 548PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 549 register PyObject *obj /* Object */ 550 ); 551 552PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list); 553PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...); 554 555PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 556PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 557PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *); 558PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 559 560/* Use only if you know it's a string */ 561#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state) 562 563/* --- wchar_t support for platforms which support it --------------------- */ 564 565#ifdef HAVE_WCHAR_H 566 567/* Create a Unicode Object from the whcar_t buffer w of the given 568 size. 569 570 The buffer is copied into the new object. */ 571 572PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 573 register const wchar_t *w, /* wchar_t buffer */ 574 Py_ssize_t size /* size of buffer */ 575 ); 576 577/* Copies the Unicode Object contents into the wchar_t buffer w. 
At 578 most size wchar_t characters are copied. 579 580 Note that the resulting wchar_t string may or may not be 581 0-terminated. It is the responsibility of the caller to make sure 582 that the wchar_t string is 0-terminated in case this is required by 583 the application. 584 585 Returns the number of wchar_t characters copied (excluding a 586 possibly trailing 0-termination character) or -1 in case of an 587 error. */ 588 589PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 590 PyUnicodeObject *unicode, /* Unicode object */ 591 register wchar_t *w, /* wchar_t buffer */ 592 Py_ssize_t size /* size of buffer */ 593 ); 594 595#endif 596 597/* --- Unicode ordinals --------------------------------------------------- */ 598 599/* Create a Unicode Object from the given Unicode code point ordinal. 600 601 The ordinal must be in range(0x10000) on narrow Python builds 602 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is 603 raised in case it is not. 604 605*/ 606 607PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 608 609/* === Builtin Codecs ===================================================== 610 611 Many of these APIs take two arguments encoding and errors. These 612 parameters encoding and errors have the same semantics as the ones 613 of the builtin unicode() API. 614 615 Setting encoding to NULL causes the default encoding to be used. 616 617 Error handling is set by errors which may also be set to NULL 618 meaning to use the default handling defined for the codec. Default 619 error handling for all builtin codecs is "strict" (ValueErrors are 620 raised). 621 622 The codecs all use a similar interface. Only deviation from the 623 generic ones are documented. 624 625*/ 626 627/* --- Manage the default encoding ---------------------------------------- */ 628 629/* Return a Python string holding the default encoded value of the 630 Unicode object. 
631 632 The resulting string is cached in the Unicode object for subsequent 633 usage by this function. The cached version is needed to implement 634 the character buffer interface and will live (at least) as long as 635 the Unicode object itself. 636 637 The refcount of the string is *not* incremented. 638 639 *** Exported for internal use by the interpreter only !!! *** 640 641*/ 642 643PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString( 644 PyObject *, const char *); 645 646/* Decode a null-terminated string using Py_FileSystemDefaultEncoding. 647 648 If the encoding is supported by one of the built-in codecs (i.e., UTF-8, 649 UTF-16, UTF-32, Latin-1 or MBCS), otherwise fallback to UTF-8 and replace 650 invalid characters with '?'. 651 652 The function is intended to be used for paths and file names only 653 during bootstrapping process where the codecs are not set up. 654*/ 655 656PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 657 const char *s /* encoded string */ 658 ); 659 660/* Return a char* holding the UTF-8 encoded value of the 661 Unicode object. 662 663 DEPRECATED: use PyUnicode_AsStringAndSize() instead. 664*/ 665 666PyAPI_FUNC(char *) PyUnicode_AsStringAndSize(PyObject*, Py_ssize_t *); 667 668/* Returns the UTF-8 encoding, and its size. 669 670 If the output argument is NULL, no size is stored. 671 */ 672 673PyAPI_FUNC(char *) PyUnicode_AsString(PyObject*); 674 675/* Returns the UTF-8 encoding. 676 677 This is equivalent to PyUnicode_AsStringAndSize(x, NULL). 678 679 */ 680 681PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 682 683/* Sets the currently active default encoding. 684 685 Returns 0 on success, -1 in case of an error. 
686 687 */ 688 689PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding( 690 const char *encoding /* Encoding name in standard form */ 691 ); 692 693/* --- Generic Codecs ----------------------------------------------------- */ 694 695/* Create a Unicode object by decoding the encoded string s of the 696 given size. */ 697 698PyAPI_FUNC(PyObject*) PyUnicode_Decode( 699 const char *s, /* encoded string */ 700 Py_ssize_t size, /* size of buffer */ 701 const char *encoding, /* encoding */ 702 const char *errors /* error handling */ 703 ); 704 705/* Encodes a Py_UNICODE buffer of the given size and returns a 706 Python string object. */ 707 708PyAPI_FUNC(PyObject*) PyUnicode_Encode( 709 const Py_UNICODE *s, /* Unicode char buffer */ 710 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 711 const char *encoding, /* encoding */ 712 const char *errors /* error handling */ 713 ); 714 715/* Encodes a Unicode object and returns the result as Python 716 object. */ 717 718PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 719 PyObject *unicode, /* Unicode object */ 720 const char *encoding, /* encoding */ 721 const char *errors /* error handling */ 722 ); 723 724/* Encodes a Unicode object and returns the result as Python string 725 object. 
*/ 726 727PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 728 PyObject *unicode, /* Unicode object */ 729 const char *encoding, /* encoding */ 730 const char *errors /* error handling */ 731 ); 732 733PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 734 PyObject* string /* 256 character map */ 735 ); 736 737 738/* --- UTF-7 Codecs ------------------------------------------------------- */ 739 740PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 741 const char *string, /* UTF-7 encoded string */ 742 Py_ssize_t length, /* size of string */ 743 const char *errors /* error handling */ 744 ); 745 746PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 747 const Py_UNICODE *data, /* Unicode char buffer */ 748 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 749 int encodeSetO, /* force the encoder to encode characters in 750 Set O, as described in RFC2152 */ 751 int encodeWhiteSpace, /* force the encoder to encode space, tab, 752 carriage return and linefeed characters */ 753 const char *errors /* error handling */ 754 ); 755 756/* --- UTF-8 Codecs ------------------------------------------------------- */ 757 758PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 759 const char *string, /* UTF-8 encoded string */ 760 Py_ssize_t length, /* size of string */ 761 const char *errors /* error handling */ 762 ); 763 764PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 765 const char *string, /* UTF-8 encoded string */ 766 Py_ssize_t length, /* size of string */ 767 const char *errors, /* error handling */ 768 Py_ssize_t *consumed /* bytes consumed */ 769 ); 770 771PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 772 PyObject *unicode /* Unicode object */ 773 ); 774 775PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 776 const Py_UNICODE *data, /* Unicode char buffer */ 777 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 778 const char *errors /* error handling */ 779 ); 780 781/* --- UTF-32 Codecs ------------------------------------------------------ */ 782 783/* Decodes 
length bytes from a UTF-32 encoded buffer string and returns 784 the corresponding Unicode object. 785 786 errors (if non-NULL) defines the error handling. It defaults 787 to "strict". 788 789 If byteorder is non-NULL, the decoder starts decoding using the 790 given byte order: 791 792 *byteorder == -1: little endian 793 *byteorder == 0: native order 794 *byteorder == 1: big endian 795 796 In native mode, the first four bytes of the stream are checked for a 797 BOM mark. If found, the BOM mark is analysed, the byte order 798 adjusted and the BOM skipped. In the other modes, no BOM mark 799 interpretation is done. After completion, *byteorder is set to the 800 current byte order at the end of input data. 801 802 If byteorder is NULL, the codec starts in native order mode. 803 804*/ 805 806PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 807 const char *string, /* UTF-32 encoded string */ 808 Py_ssize_t length, /* size of string */ 809 const char *errors, /* error handling */ 810 int *byteorder /* pointer to byteorder to use 811 0=native;-1=LE,1=BE; updated on 812 exit */ 813 ); 814 815PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 816 const char *string, /* UTF-32 encoded string */ 817 Py_ssize_t length, /* size of string */ 818 const char *errors, /* error handling */ 819 int *byteorder, /* pointer to byteorder to use 820 0=native;-1=LE,1=BE; updated on 821 exit */ 822 Py_ssize_t *consumed /* bytes consumed */ 823 ); 824 825/* Returns a Python string using the UTF-32 encoding in native byte 826 order. The string always starts with a BOM mark. */ 827 828PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 829 PyObject *unicode /* Unicode object */ 830 ); 831 832/* Returns a Python string object holding the UTF-32 encoded value of 833 the Unicode data. 
834 835 If byteorder is not 0, output is written according to the following 836 byte order: 837 838 byteorder == -1: little endian 839 byteorder == 0: native byte order (writes a BOM mark) 840 byteorder == 1: big endian 841 842 If byteorder is 0, the output string will always start with the 843 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 844 prepended. 845 846*/ 847 848PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 849 const Py_UNICODE *data, /* Unicode char buffer */ 850 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 851 const char *errors, /* error handling */ 852 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 853 ); 854 855/* --- UTF-16 Codecs ------------------------------------------------------ */ 856 857/* Decodes length bytes from a UTF-16 encoded buffer string and returns 858 the corresponding Unicode object. 859 860 errors (if non-NULL) defines the error handling. It defaults 861 to "strict". 862 863 If byteorder is non-NULL, the decoder starts decoding using the 864 given byte order: 865 866 *byteorder == -1: little endian 867 *byteorder == 0: native order 868 *byteorder == 1: big endian 869 870 In native mode, the first two bytes of the stream are checked for a 871 BOM mark. If found, the BOM mark is analysed, the byte order 872 adjusted and the BOM skipped. In the other modes, no BOM mark 873 interpretation is done. After completion, *byteorder is set to the 874 current byte order at the end of input data. 875 876 If byteorder is NULL, the codec starts in native order mode. 
877 878*/ 879 880PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 881 const char *string, /* UTF-16 encoded string */ 882 Py_ssize_t length, /* size of string */ 883 const char *errors, /* error handling */ 884 int *byteorder /* pointer to byteorder to use 885 0=native;-1=LE,1=BE; updated on 886 exit */ 887 ); 888 889PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 890 const char *string, /* UTF-16 encoded string */ 891 Py_ssize_t length, /* size of string */ 892 const char *errors, /* error handling */ 893 int *byteorder, /* pointer to byteorder to use 894 0=native;-1=LE,1=BE; updated on 895 exit */ 896 Py_ssize_t *consumed /* bytes consumed */ 897 ); 898 899/* Returns a Python string using the UTF-16 encoding in native byte 900 order. The string always starts with a BOM mark. */ 901 902PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 903 PyObject *unicode /* Unicode object */ 904 ); 905 906/* Returns a Python string object holding the UTF-16 encoded value of 907 the Unicode data. 908 909 If byteorder is not 0, output is written according to the following 910 byte order: 911 912 byteorder == -1: little endian 913 byteorder == 0: native byte order (writes a BOM mark) 914 byteorder == 1: big endian 915 916 If byteorder is 0, the output string will always start with the 917 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 918 prepended. 919 920 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 921 UCS-2. This trick makes it possible to add full UTF-16 capabilities 922 at a later point without compromising the APIs. 
923 924*/ 925 926PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 927 const Py_UNICODE *data, /* Unicode char buffer */ 928 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 929 const char *errors, /* error handling */ 930 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 931 ); 932 933/* --- Unicode-Escape Codecs ---------------------------------------------- */ 934 935PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 936 const char *string, /* Unicode-Escape encoded string */ 937 Py_ssize_t length, /* size of string */ 938 const char *errors /* error handling */ 939 ); 940 941PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 942 PyObject *unicode /* Unicode object */ 943 ); 944 945PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 946 const Py_UNICODE *data, /* Unicode char buffer */ 947 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 948 ); 949 950/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 951 952PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 953 const char *string, /* Raw-Unicode-Escape encoded string */ 954 Py_ssize_t length, /* size of string */ 955 const char *errors /* error handling */ 956 ); 957 958PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 959 PyObject *unicode /* Unicode object */ 960 ); 961 962PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 963 const Py_UNICODE *data, /* Unicode char buffer */ 964 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 965 ); 966 967/* --- Unicode Internal Codec --------------------------------------------- 968 969 Only for internal use in _codecsmodule.c */ 970 971PyObject *_PyUnicode_DecodeUnicodeInternal( 972 const char *string, 973 Py_ssize_t length, 974 const char *errors 975 ); 976 977/* --- Latin-1 Codecs ----------------------------------------------------- 978 979 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 
980 981*/ 982 983PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 984 const char *string, /* Latin-1 encoded string */ 985 Py_ssize_t length, /* size of string */ 986 const char *errors /* error handling */ 987 ); 988 989PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 990 PyObject *unicode /* Unicode object */ 991 ); 992 993PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 994 const Py_UNICODE *data, /* Unicode char buffer */ 995 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 996 const char *errors /* error handling */ 997 ); 998 999/* --- ASCII Codecs ------------------------------------------------------- 1000 1001 Only 7-bit ASCII data is accepted. All other codes generate errors. 1002 1003*/ 1004 1005PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1006 const char *string, /* ASCII encoded string */ 1007 Py_ssize_t length, /* size of string */ 1008 const char *errors /* error handling */ 1009 ); 1010 1011PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1012 PyObject *unicode /* Unicode object */ 1013 ); 1014 1015PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1016 const Py_UNICODE *data, /* Unicode char buffer */ 1017 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1018 const char *errors /* error handling */ 1019 ); 1020 1021/* --- Character Map Codecs ----------------------------------------------- 1022 1023 This codec uses mappings to encode and decode characters. 1024 1025 Decoding mappings must map single string characters to single 1026 Unicode characters, integers (which are then interpreted as Unicode 1027 ordinals) or None (meaning "undefined mapping" and causing an 1028 error). 1029 1030 Encoding mappings must map single Unicode characters to single 1031 string characters, integers (which are then interpreted as Latin-1 1032 ordinals) or None (meaning "undefined mapping" and causing an 1033 error).
1034 1035 If a character lookup fails with a LookupError, the character is 1036 copied as-is meaning that its ordinal value will be interpreted as 1037 Unicode or Latin-1 ordinal resp. Because of this mappings only need 1038 to contain those mappings which map characters to different code 1039 points. 1040 1041*/ 1042 1043PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1044 const char *string, /* Encoded string */ 1045 Py_ssize_t length, /* size of string */ 1046 PyObject *mapping, /* character mapping 1047 (char ordinal -> unicode ordinal) */ 1048 const char *errors /* error handling */ 1049 ); 1050 1051PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1052 PyObject *unicode, /* Unicode object */ 1053 PyObject *mapping /* character mapping 1054 (unicode ordinal -> char ordinal) */ 1055 ); 1056 1057PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1058 const Py_UNICODE *data, /* Unicode char buffer */ 1059 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1060 PyObject *mapping, /* character mapping 1061 (unicode ordinal -> char ordinal) */ 1062 const char *errors /* error handling */ 1063 ); 1064 1065/* Translate a Py_UNICODE buffer of the given length by applying a 1066 character mapping table to it and return the resulting Unicode 1067 object. 1068 1069 The mapping table must map Unicode ordinal integers to Unicode 1070 ordinal integers or None (causing deletion of the character). 1071 1072 Mapping tables may be dictionaries or sequences. Unmapped character 1073 ordinals (ones which cause a LookupError) are left untouched and 1074 are copied as-is. 
1075 1076*/ 1077 1078PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1079 const Py_UNICODE *data, /* Unicode char buffer */ 1080 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1081 PyObject *table, /* Translate table */ 1082 const char *errors /* error handling */ 1083 ); 1084 1085#ifdef MS_WIN32 1086 1087/* --- MBCS codecs for Windows -------------------------------------------- */ 1088 1089PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1090 const char *string, /* MBCS encoded string */ 1091 Py_ssize_t length, /* size of string */ 1092 const char *errors /* error handling */ 1093 ); 1094 1095PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1096 const char *string, /* MBCS encoded string */ 1097 Py_ssize_t length, /* size of string */ 1098 const char *errors, /* error handling */ 1099 Py_ssize_t *consumed /* bytes consumed */ 1100 ); 1101 1102PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1103 PyObject *unicode /* Unicode object */ 1104 ); 1105 1106PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1107 const Py_UNICODE *data, /* Unicode char buffer */ 1108 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1109 const char *errors /* error handling */ 1110 ); 1111 1112#endif /* MS_WIN32 */ 1113 1114/* --- Decimal Encoder ---------------------------------------------------- */ 1115 1116/* Takes a Unicode string holding a decimal value and writes it into 1117 an output buffer using standard ASCII digit codes. 1118 1119 The output buffer has to provide at least length+1 bytes of storage 1120 area. The output string is 0-terminated. 1121 1122 The encoder converts whitespace to ' ', decimal characters to their 1123 corresponding ASCII digit and all other Latin-1 characters except 1124 \0 as-is. Characters outside this range (Unicode ordinals 1-255) 1125 are treated as errors. This includes embedded NULL bytes.
1126 1127 Error handling is defined by the errors argument: 1128 1129 NULL or "strict": raise a ValueError 1130 "ignore": ignore the wrong characters (these are not copied to the 1131 output buffer) 1132 "replace": replaces illegal characters with '?' 1133 1134 Returns 0 on success, -1 on failure. 1135 1136*/ 1137 1138PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1139 Py_UNICODE *s, /* Unicode buffer */ 1140 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1141 char *output, /* Output buffer; must have size >= length+1 */ 1142 const char *errors /* error handling */ 1143 ); 1144 1145/* --- Methods & Slots ---------------------------------------------------- 1146 1147 These are capable of handling Unicode objects and strings on input 1148 (we refer to them as strings in the descriptions) and return 1149 Unicode objects or integers as appropriate. */ 1150 1151/* Concat two strings giving a new Unicode string. */ 1152 1153PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1154 PyObject *left, /* Left string */ 1155 PyObject *right /* Right string */ 1156 ); 1157 1158/* Concat two strings and put the result in *pleft 1159 (sets *pleft to NULL on error) */ 1160 1161PyAPI_FUNC(void) PyUnicode_Append( 1162 PyObject **pleft, /* Pointer to left string */ 1163 PyObject *right /* Right string */ 1164 ); 1165 1166/* Concat two strings, put the result in *pleft and drop the right object 1167 (sets *pleft to NULL on error) */ 1168 1169PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1170 PyObject **pleft, /* Pointer to left string */ 1171 PyObject *right /* Right string */ 1172 ); 1173 1174/* Split a string giving a list of Unicode strings. 1175 1176 If sep is NULL, splitting will be done at all whitespace 1177 substrings. Otherwise, splits occur at the given separator. 1178 1179 At most maxsplit splits will be done. If negative, no limit is set. 1180 1181 Separators are not included in the resulting list.
1182 1183*/ 1184 1185PyAPI_FUNC(PyObject*) PyUnicode_Split( 1186 PyObject *s, /* String to split */ 1187 PyObject *sep, /* String separator */ 1188 Py_ssize_t maxsplit /* Maxsplit count */ 1189 ); 1190 1191/* Ditto, but split at line breaks. 1192 1193 CRLF is considered to be one line break. Line breaks are not 1194 included in the resulting list unless keepends is true. */ 1195 1196PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1197 PyObject *s, /* String to split */ 1198 int keepends /* If true, line end markers are included */ 1199 ); 1200 1201/* Partition a string using a given separator. */ 1202 1203PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1204 PyObject *s, /* String to partition */ 1205 PyObject *sep /* String separator */ 1206 ); 1207 1208/* Partition a string using a given separator, searching from the end of the 1209 string. */ 1210 1211PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1212 PyObject *s, /* String to partition */ 1213 PyObject *sep /* String separator */ 1214 ); 1215 1216/* Split a string giving a list of Unicode strings. 1217 1218 If sep is NULL, splitting will be done at all whitespace 1219 substrings. Otherwise, splits occur at the given separator. 1220 1221 At most maxsplit splits will be done. But unlike PyUnicode_Split, 1222 PyUnicode_RSplit splits from the end of the string. If negative, 1223 no limit is set. 1224 1225 Separators are not included in the resulting list. 1226 1227*/ 1228 1229PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1230 PyObject *s, /* String to split */ 1231 PyObject *sep, /* String separator */ 1232 Py_ssize_t maxsplit /* Maxsplit count */ 1233 ); 1234 1235/* Translate a string by applying a character mapping table to it and 1236 return the resulting Unicode object. 1237 1238 The mapping table must map Unicode ordinal integers to Unicode 1239 ordinal integers or None (causing deletion of the character). 1240 1241 Mapping tables may be dictionaries or sequences.
Unmapped character 1242 ordinals (ones which cause a LookupError) are left untouched and 1243 are copied as-is. 1244 1245*/ 1246 1247PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1248 PyObject *str, /* String */ 1249 PyObject *table, /* Translate table */ 1250 const char *errors /* error handling */ 1251 ); 1252 1253/* Join a sequence of strings using the given separator and return 1254 the resulting Unicode string. */ 1255 1256PyAPI_FUNC(PyObject*) PyUnicode_Join( 1257 PyObject *separator, /* Separator string */ 1258 PyObject *seq /* Sequence object */ 1259 ); 1260 1261/* Return 1 if substr matches str[start:end] at the given tail end, 0 1262 otherwise. */ 1263 1264PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1265 PyObject *str, /* String */ 1266 PyObject *substr, /* Prefix or Suffix string */ 1267 Py_ssize_t start, /* Start index */ 1268 Py_ssize_t end, /* Stop index */ 1269 int direction /* Tail end: -1 prefix, +1 suffix */ 1270 ); 1271 1272/* Return the first position of substr in str[start:end] using the 1273 given search direction or -1 if not found. -2 is returned in case 1274 an error occurred and an exception is set. */ 1275 1276PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 1277 PyObject *str, /* String */ 1278 PyObject *substr, /* Substring to find */ 1279 Py_ssize_t start, /* Start index */ 1280 Py_ssize_t end, /* Stop index */ 1281 int direction /* Find direction: +1 forward, -1 backward */ 1282 ); 1283 1284/* Count the number of occurrences of substr in str[start:end]. */ 1285 1286PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 1287 PyObject *str, /* String */ 1288 PyObject *substr, /* Substring to count */ 1289 Py_ssize_t start, /* Start index */ 1290 Py_ssize_t end /* Stop index */ 1291 ); 1292 1293/* Replace at most maxcount occurrences of substr in str with replstr 1294 and return the resulting Unicode object. 
*/ 1295 1296PyAPI_FUNC(PyObject *) PyUnicode_Replace( 1297 PyObject *str, /* String */ 1298 PyObject *substr, /* Substring to find */ 1299 PyObject *replstr, /* Replacement string */ 1300 Py_ssize_t maxcount /* Max. number of replacements to apply; 1301 -1 = all */ 1302 ); 1303 1304/* Compare two strings and return -1, 0, 1 for less than, equal, 1305 greater than resp. */ 1306 1307PyAPI_FUNC(int) PyUnicode_Compare( 1308 PyObject *left, /* Left string */ 1309 PyObject *right /* Right string */ 1310 ); 1311 1312PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 1313 PyObject *left, 1314 const char *right 1315 ); 1316 1317/* Rich compare two strings and return one of the following: 1318 1319 - NULL in case an exception was raised 1320 - Py_True or Py_False for successful comparisons 1321 - Py_NotImplemented in case the type combination is unknown 1322 1323 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in 1324 case the conversion of the arguments to Unicode fails with a 1325 UnicodeDecodeError. 1326 1327 Possible values for op: 1328 1329 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1330 1331*/ 1332 1333PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1334 PyObject *left, /* Left string */ 1335 PyObject *right, /* Right string */ 1336 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1337 ); 1338 1339/* Apply an argument tuple or dictionary to a format string and return 1340 the resulting Unicode string. */ 1341 1342PyAPI_FUNC(PyObject *) PyUnicode_Format( 1343 PyObject *format, /* Format string */ 1344 PyObject *args /* Argument tuple or dictionary */ 1345 ); 1346 1347/* Checks whether element is contained in container and returns 1/0 1348 accordingly. 1349 1350 element has to coerce to a one-element Unicode string. -1 is 1351 returned in case of an error.
*/ 1352 1353PyAPI_FUNC(int) PyUnicode_Contains( 1354 PyObject *container, /* Container string */ 1355 PyObject *element /* Element string */ 1356 ); 1357 1358/* Checks whether argument is a valid identifier. */ 1359 1360PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1361 1362/* Externally visible for str.strip(unicode) */ 1363PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 1364 PyUnicodeObject *self, 1365 int striptype, 1366 PyObject *sepobj 1367 ); 1368 1369/* === Characters Type APIs =============================================== */ 1370 1371/* These should not be used directly. Use the Py_UNICODE_IS* and 1372 Py_UNICODE_TO* macros instead. 1373 1374 These APIs are implemented in Objects/unicodectype.c. 1375 1376*/ 1377 1378PyAPI_FUNC(int) _PyUnicode_IsLowercase( 1379 Py_UNICODE ch /* Unicode character */ 1380 ); 1381 1382PyAPI_FUNC(int) _PyUnicode_IsUppercase( 1383 Py_UNICODE ch /* Unicode character */ 1384 ); 1385 1386PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 1387 Py_UNICODE ch /* Unicode character */ 1388 ); 1389 1390PyAPI_FUNC(int) _PyUnicode_IsXidStart( 1391 Py_UNICODE ch /* Unicode character */ 1392 ); 1393 1394PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 1395 Py_UNICODE ch /* Unicode character */ 1396 ); 1397 1398PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 1399 const Py_UNICODE ch /* Unicode character */ 1400 ); 1401 1402PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 1403 const Py_UNICODE ch /* Unicode character */ 1404 ); 1405 1406PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase( 1407 Py_UNICODE ch /* Unicode character */ 1408 ); 1409 1410PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase( 1411 Py_UNICODE ch /* Unicode character */ 1412 ); 1413 1414PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase( 1415 Py_UNICODE ch /* Unicode character */ 1416 ); 1417 1418PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 1419 Py_UNICODE ch /* Unicode character */ 1420 ); 1421 1422PyAPI_FUNC(int) _PyUnicode_ToDigit( 1423 Py_UNICODE ch /* Unicode character */ 1424 ); 1425 1426PyAPI_FUNC(double) 
_PyUnicode_ToNumeric( 1427 Py_UNICODE ch /* Unicode character */ 1428 ); 1429 1430PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 1431 Py_UNICODE ch /* Unicode character */ 1432 ); 1433 1434PyAPI_FUNC(int) _PyUnicode_IsDigit( 1435 Py_UNICODE ch /* Unicode character */ 1436 ); 1437 1438PyAPI_FUNC(int) _PyUnicode_IsNumeric( 1439 Py_UNICODE ch /* Unicode character */ 1440 ); 1441 1442PyAPI_FUNC(int) _PyUnicode_IsAlpha( 1443 Py_UNICODE ch /* Unicode character */ 1444 ); 1445 1446PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u); 1447 1448PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 1449 Py_UNICODE *s1, const Py_UNICODE *s2); 1450 1451PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 1452 Py_UNICODE *s1, const Py_UNICODE *s2, size_t n); 1453 1454PyAPI_FUNC(int) Py_UNICODE_strcmp( 1455 const Py_UNICODE *s1, const Py_UNICODE *s2); 1456 1457PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 1458 const Py_UNICODE *s, Py_UNICODE c 1459 ); 1460 1461#ifdef __cplusplus 1462} 1463#endif 1464#endif /* !Py_UNICODEOBJECT_H */ 1465