/* unicodeobject.h revision 5e6007c5dbb14a6c64d4b7ee95793465f544bca6 */
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4/* 5 6Unicode implementation based on original code by Fredrik Lundh, 7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 8Unicode Integration Proposal (see file Misc/unicode.txt). 9 10Copyright (c) Corporation for National Research Initiatives. 11 12 13 Original header: 14 -------------------------------------------------------------------- 15 16 * Yet another Unicode string type for Python. This type supports the 17 * 16-bit Basic Multilingual Plane (BMP) only. 18 * 19 * Written by Fredrik Lundh, January 1999. 20 * 21 * Copyright (c) 1999 by Secret Labs AB. 22 * Copyright (c) 1999 by Fredrik Lundh. 23 * 24 * fredrik@pythonware.com 25 * http://www.pythonware.com 26 * 27 * -------------------------------------------------------------------- 28 * This Unicode String Type is 29 * 30 * Copyright (c) 1999 by Secret Labs AB 31 * Copyright (c) 1999 by Fredrik Lundh 32 * 33 * By obtaining, using, and/or copying this software and/or its 34 * associated documentation, you agree that you have read, understood, 35 * and will comply with the following terms and conditions: 36 * 37 * Permission to use, copy, modify, and distribute this software and its 38 * associated documentation for any purpose and without fee is hereby 39 * granted, provided that the above copyright notice appears in all 40 * copies, and that both that copyright notice and this permission notice 41 * appear in supporting documentation, and that the name of Secret Labs 42 * AB or the author not be used in advertising or publicity pertaining to 43 * distribution of the software without specific, written prior 44 * permission. 45 * 46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 48 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 53 * -------------------------------------------------------------------- */ 54 55#include <ctype.h> 56 57/* === Internal API ======================================================= */ 58 59/* --- Internal Unicode Format -------------------------------------------- */ 60 61#ifndef Py_USING_UNICODE 62 63#define PyUnicode_Check(op) 0 64#define PyUnicode_CheckExact(op) 0 65 66#else 67 68/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is 69 properly set, but the default rules below doesn't set it. I'll 70 sort this out some other day -- fredrik@pythonware.com */ 71 72#ifndef Py_UNICODE_SIZE 73#error Must define Py_UNICODE_SIZE 74#endif 75 76/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode 77 strings are stored as UCS-2 (with limited support for UTF-16) */ 78 79#if Py_UNICODE_SIZE >= 4 80#define Py_UNICODE_WIDE 81#endif 82 83/* Set these flags if the platform has "wchar.h", "wctype.h" and the 84 wchar_t type is a 16-bit unsigned type */ 85/* #define HAVE_WCHAR_H */ 86/* #define HAVE_USABLE_WCHAR_T */ 87 88/* Defaults for various platforms */ 89#ifndef PY_UNICODE_TYPE 90 91/* Windows has a usable wchar_t type (unless we're using UCS-4) */ 92# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 93# define HAVE_USABLE_WCHAR_T 94# define PY_UNICODE_TYPE wchar_t 95# endif 96 97# if defined(Py_UNICODE_WIDE) 98# define PY_UNICODE_TYPE Py_UCS4 99# endif 100 101#endif 102 103/* If the compiler provides a wchar_t type we try to support it 104 through the interface functions PyUnicode_FromWideChar() and 105 PyUnicode_AsWideChar(). 
*/ 106 107#ifdef HAVE_USABLE_WCHAR_T 108# ifndef HAVE_WCHAR_H 109# define HAVE_WCHAR_H 110# endif 111#endif 112 113#ifdef HAVE_WCHAR_H 114/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 115# ifdef _HAVE_BSDI 116# include <time.h> 117# endif 118# include <wchar.h> 119#endif 120 121/* 122 * Use this typedef when you need to represent a UTF-16 surrogate pair 123 * as single unsigned integer. 124 */ 125#if SIZEOF_INT >= 4 126typedef unsigned int Py_UCS4; 127#elif SIZEOF_LONG >= 4 128typedef unsigned long Py_UCS4; 129#endif 130 131typedef PY_UNICODE_TYPE Py_UNICODE; 132 133/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ 134 135/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds 136 produce different external names and thus cause import errors in 137 case Python interpreters and extensions with mixed compiled in 138 Unicode width assumptions are combined. */ 139 140#ifndef Py_UNICODE_WIDE 141 142# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString 143# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString 144# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString 145# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String 146# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString 147# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String 148# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String 149# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode 150# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString 151# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar 152# define PyUnicode_Compare PyUnicodeUCS2_Compare 153# define PyUnicode_Concat PyUnicodeUCS2_Concat 154# define PyUnicode_Contains PyUnicodeUCS2_Contains 155# define PyUnicode_Count PyUnicodeUCS2_Count 156# define PyUnicode_Decode PyUnicodeUCS2_Decode 157# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII 158# define 
PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap 159# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 160# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape 161# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 162# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 163# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape 164# define PyUnicode_Encode PyUnicodeUCS2_Encode 165# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII 166# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap 167# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal 168# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 169# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape 170# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 171# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 172# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape 173# define PyUnicode_Find PyUnicodeUCS2_Find 174# define PyUnicode_Format PyUnicodeUCS2_Format 175# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject 176# define PyUnicode_FromObject PyUnicodeUCS2_FromObject 177# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode 178# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar 179# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding 180# define PyUnicode_GetMax PyUnicodeUCS2_GetMax 181# define PyUnicode_GetSize PyUnicodeUCS2_GetSize 182# define PyUnicode_Join PyUnicodeUCS2_Join 183# define PyUnicode_Replace PyUnicodeUCS2_Replace 184# define PyUnicode_Resize PyUnicodeUCS2_Resize 185# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding 186# define PyUnicode_Split PyUnicodeUCS2_Split 187# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines 188# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch 189# define PyUnicode_Translate PyUnicodeUCS2_Translate 190# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap 
191# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString 192# define _PyUnicode_Fini _PyUnicodeUCS2_Fini 193# define _PyUnicode_Init _PyUnicodeUCS2_Init 194# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha 195# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit 196# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit 197# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak 198# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase 199# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric 200# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase 201# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase 202# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace 203# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit 204# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit 205# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase 206# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric 207# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase 208# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase 209 210#else 211 212# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString 213# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString 214# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString 215# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String 216# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString 217# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String 218# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String 219# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode 220# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString 221# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar 222# define PyUnicode_Compare PyUnicodeUCS4_Compare 223# define PyUnicode_Concat PyUnicodeUCS4_Concat 224# define PyUnicode_Contains PyUnicodeUCS4_Contains 225# define PyUnicode_Count PyUnicodeUCS4_Count 226# define 
PyUnicode_Decode PyUnicodeUCS4_Decode 227# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII 228# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap 229# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 230# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape 231# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 232# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 233# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape 234# define PyUnicode_Encode PyUnicodeUCS4_Encode 235# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII 236# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap 237# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal 238# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 239# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape 240# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 241# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 242# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape 243# define PyUnicode_Find PyUnicodeUCS4_Find 244# define PyUnicode_Format PyUnicodeUCS4_Format 245# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject 246# define PyUnicode_FromObject PyUnicodeUCS4_FromObject 247# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode 248# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar 249# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding 250# define PyUnicode_GetMax PyUnicodeUCS4_GetMax 251# define PyUnicode_GetSize PyUnicodeUCS4_GetSize 252# define PyUnicode_Join PyUnicodeUCS4_Join 253# define PyUnicode_Replace PyUnicodeUCS4_Replace 254# define PyUnicode_Resize PyUnicodeUCS4_Resize 255# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding 256# define PyUnicode_Split PyUnicodeUCS4_Split 257# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines 258# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch 259# define 
PyUnicode_Translate PyUnicodeUCS4_Translate 260# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap 261# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString 262# define _PyUnicode_Fini _PyUnicodeUCS4_Fini 263# define _PyUnicode_Init _PyUnicodeUCS4_Init 264# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha 265# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit 266# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit 267# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak 268# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase 269# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric 270# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase 271# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase 272# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace 273# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit 274# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit 275# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase 276# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric 277# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase 278# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase 279 280 281#endif 282 283/* --- Internal Unicode Operations ---------------------------------------- */ 284 285/* If you want Python to use the compiler's wctype.h functions instead 286 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or 287 configure Python using --with-ctype-functions. This reduces the 288 interpreter's code size. 
*/ 289 290#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) 291 292#include <wctype.h> 293 294#define Py_UNICODE_ISSPACE(ch) iswspace(ch) 295 296#define Py_UNICODE_ISLOWER(ch) iswlower(ch) 297#define Py_UNICODE_ISUPPER(ch) iswupper(ch) 298#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 299#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 300 301#define Py_UNICODE_TOLOWER(ch) towlower(ch) 302#define Py_UNICODE_TOUPPER(ch) towupper(ch) 303#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 304 305#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 306#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 307#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 308 309#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 310#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 311#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 312 313#define Py_UNICODE_ISALPHA(ch) iswalpha(ch) 314 315#else 316 317#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch) 318 319#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 320#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 321#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 322#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 323 324#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 325#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 326#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 327 328#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 329#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 330#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 331 332#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 333#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 334#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 335 336#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 337 338#endif 339 340#define Py_UNICODE_ISALNUM(ch) \ 341 
(Py_UNICODE_ISALPHA(ch) || \ 342 Py_UNICODE_ISDECIMAL(ch) || \ 343 Py_UNICODE_ISDIGIT(ch) || \ 344 Py_UNICODE_ISNUMERIC(ch)) 345 346#define Py_UNICODE_COPY(target, source, length)\ 347 (memcpy((target), (source), (length)*sizeof(Py_UNICODE))) 348 349#define Py_UNICODE_FILL(target, value, length) do\ 350 {int i; for (i = 0; i < (length); i++) (target)[i] = (value);}\ 351 while (0) 352 353#define Py_UNICODE_MATCH(string, offset, substring)\ 354 ((*((string)->str + (offset)) == *((substring)->str)) &&\ 355 !memcmp((string)->str + (offset), (substring)->str,\ 356 (substring)->length*sizeof(Py_UNICODE))) 357 358#ifdef __cplusplus 359extern "C" { 360#endif 361 362/* --- Unicode Type ------------------------------------------------------- */ 363 364typedef struct { 365 PyObject_HEAD 366 int length; /* Length of raw Unicode data in buffer */ 367 Py_UNICODE *str; /* Raw Unicode buffer */ 368 long hash; /* Hash value; -1 if not set */ 369 PyObject *defenc; /* (Default) Encoded version as Python 370 string, or NULL; this is used for 371 implementing the buffer protocol */ 372} PyUnicodeObject; 373 374extern DL_IMPORT(PyTypeObject) PyUnicode_Type; 375 376#define PyUnicode_Check(op) PyObject_TypeCheck(op, &PyUnicode_Type) 377#define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type) 378 379/* Fast access macros */ 380#define PyUnicode_GET_SIZE(op) \ 381 (((PyUnicodeObject *)(op))->length) 382#define PyUnicode_GET_DATA_SIZE(op) \ 383 (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)) 384#define PyUnicode_AS_UNICODE(op) \ 385 (((PyUnicodeObject *)(op))->str) 386#define PyUnicode_AS_DATA(op) \ 387 ((const char *)((PyUnicodeObject *)(op))->str) 388 389/* --- Constants ---------------------------------------------------------- */ 390 391/* This Unicode character will be used as replacement character during 392 decoding if the errors argument is set to "replace". Note: the 393 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 394 Unicode 3.0. 
*/ 395 396#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 397 398/* === Public API ========================================================= */ 399 400/* --- Plain Py_UNICODE --------------------------------------------------- */ 401 402/* Create a Unicode Object from the Py_UNICODE buffer u of the given 403 size. 404 405 u may be NULL which causes the contents to be undefined. It is the 406 user's responsibility to fill in the needed data afterwards. Note 407 that modifying the Unicode object contents after construction is 408 only allowed if u was set to NULL. 409 410 The buffer is copied into the new object. */ 411 412extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode( 413 const Py_UNICODE *u, /* Unicode buffer */ 414 int size /* size of buffer */ 415 ); 416 417/* Return a read-only pointer to the Unicode object's internal 418 Py_UNICODE buffer. */ 419 420extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode( 421 PyObject *unicode /* Unicode object */ 422 ); 423 424/* Get the length of the Unicode object. */ 425 426extern DL_IMPORT(int) PyUnicode_GetSize( 427 PyObject *unicode /* Unicode object */ 428 ); 429 430/* Get the maximum ordinal for a Unicode character. */ 431extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void); 432 433/* Resize an already allocated Unicode object to the new size length. 434 435 *unicode is modified to point to the new (resized) object and 0 436 returned on success. 437 438 This API may only be called by the function which also called the 439 Unicode constructor. The refcount on the object must be 1. Otherwise, 440 an error is returned. 441 442 Error handling is implemented as follows: an exception is set, -1 443 is returned and *unicode left untouched. 444 445*/ 446 447extern DL_IMPORT(int) PyUnicode_Resize( 448 PyObject **unicode, /* Pointer to the Unicode object */ 449 int length /* New length */ 450 ); 451 452/* Coerce obj to an Unicode object and return a reference with 453 *incremented* refcount. 
454 455 Coercion is done in the following way: 456 457 1. Unicode objects are passed back as-is with incremented 458 refcount. 459 460 2. String and other char buffer compatible objects are decoded 461 under the assumptions that they contain data using the current 462 default encoding. Decoding is done in "strict" mode. 463 464 3. All other objects raise an exception. 465 466 The API returns NULL in case of an error. The caller is responsible 467 for decref'ing the returned objects. 468 469*/ 470 471extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject( 472 register PyObject *obj, /* Object */ 473 const char *encoding, /* encoding */ 474 const char *errors /* error handling */ 475 ); 476 477/* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict"); 478 which results in using the default encoding as basis for 479 decoding the object. 480 481 Coerces obj to an Unicode object and return a reference with 482 *incremented* refcount. 483 484 The API returns NULL in case of an error. The caller is responsible 485 for decref'ing the returned objects. 486 487*/ 488 489extern DL_IMPORT(PyObject*) PyUnicode_FromObject( 490 register PyObject *obj /* Object */ 491 ); 492 493/* --- wchar_t support for platforms which support it --------------------- */ 494 495#ifdef HAVE_WCHAR_H 496 497/* Create a Unicode Object from the whcar_t buffer w of the given 498 size. 499 500 The buffer is copied into the new object. */ 501 502extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar( 503 register const wchar_t *w, /* wchar_t buffer */ 504 int size /* size of buffer */ 505 ); 506 507/* Copies the Unicode Object contents into the whcar_t buffer w. At 508 most size wchar_t characters are copied. 509 510 Returns the number of wchar_t characters copied or -1 in case of an 511 error. 
*/ 512 513extern DL_IMPORT(int) PyUnicode_AsWideChar( 514 PyUnicodeObject *unicode, /* Unicode object */ 515 register wchar_t *w, /* wchar_t buffer */ 516 int size /* size of buffer */ 517 ); 518 519#endif 520 521/* === Builtin Codecs ===================================================== 522 523 Many of these APIs take two arguments encoding and errors. These 524 parameters encoding and errors have the same semantics as the ones 525 of the builtin unicode() API. 526 527 Setting encoding to NULL causes the default encoding to be used. 528 529 Error handling is set by errors which may also be set to NULL 530 meaning to use the default handling defined for the codec. Default 531 error handling for all builtin codecs is "strict" (ValueErrors are 532 raised). 533 534 The codecs all use a similar interface. Only deviation from the 535 generic ones are documented. 536 537*/ 538 539/* --- Manage the default encoding ---------------------------------------- */ 540 541/* Return a Python string holding the default encoded value of the 542 Unicode object. 543 544 The resulting string is cached in the Unicode object for subsequent 545 usage by this function. The cached version is needed to implement 546 the character buffer interface and will live (at least) as long as 547 the Unicode object itself. 548 549 The refcount of the string is *not* incremented. 550 551 *** Exported for internal use by the interpreter only !!! *** 552 553*/ 554 555extern DL_IMPORT(PyObject *) _PyUnicode_AsDefaultEncodedString( 556 PyObject *, const char *); 557 558/* Returns the currently active default encoding. 559 560 The default encoding is currently implemented as run-time settable 561 process global. This may change in future versions of the 562 interpreter to become a parameter which is managed on a per-thread 563 basis. 564 565 */ 566 567extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void); 568 569/* Sets the currently active default encoding. 
570 571 Returns 0 on success, -1 in case of an error. 572 573 */ 574 575extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding( 576 const char *encoding /* Encoding name in standard form */ 577 ); 578 579/* --- Generic Codecs ----------------------------------------------------- */ 580 581/* Create a Unicode object by decoding the encoded string s of the 582 given size. */ 583 584extern DL_IMPORT(PyObject*) PyUnicode_Decode( 585 const char *s, /* encoded string */ 586 int size, /* size of buffer */ 587 const char *encoding, /* encoding */ 588 const char *errors /* error handling */ 589 ); 590 591/* Encodes a Py_UNICODE buffer of the given size and returns a 592 Python string object. */ 593 594extern DL_IMPORT(PyObject*) PyUnicode_Encode( 595 const Py_UNICODE *s, /* Unicode char buffer */ 596 int size, /* number of Py_UNICODE chars to encode */ 597 const char *encoding, /* encoding */ 598 const char *errors /* error handling */ 599 ); 600 601/* Encodes a Unicode object and returns the result as Python string 602 object. 
*/ 603 604extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString( 605 PyObject *unicode, /* Unicode object */ 606 const char *encoding, /* encoding */ 607 const char *errors /* error handling */ 608 ); 609 610/* --- UTF-8 Codecs ------------------------------------------------------- */ 611 612extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8( 613 const char *string, /* UTF-8 encoded string */ 614 int length, /* size of string */ 615 const char *errors /* error handling */ 616 ); 617 618extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String( 619 PyObject *unicode /* Unicode object */ 620 ); 621 622extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( 623 const Py_UNICODE *data, /* Unicode char buffer */ 624 int length, /* number of Py_UNICODE chars to encode */ 625 const char *errors /* error handling */ 626 ); 627 628/* --- UTF-16 Codecs ------------------------------------------------------ */ 629 630/* Decodes length bytes from a UTF-16 encoded buffer string and returns 631 the corresponding Unicode object. 632 633 errors (if non-NULL) defines the error handling. It defaults 634 to "strict". 635 636 If byteorder is non-NULL, the decoder starts decoding using the 637 given byte order: 638 639 *byteorder == -1: little endian 640 *byteorder == 0: native order 641 *byteorder == 1: big endian 642 643 In native mode, the first two bytes of the stream are checked for a 644 BOM mark. If found, the BOM mark is analysed, the byte order 645 adjusted and the BOM skipped. In the other modes, no BOM mark 646 interpretation is done. After completion, *byteorder is set to the 647 current byte order at the end of input data. 648 649 If byteorder is NULL, the codec starts in native order mode. 
650 651*/ 652 653extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16( 654 const char *string, /* UTF-16 encoded string */ 655 int length, /* size of string */ 656 const char *errors, /* error handling */ 657 int *byteorder /* pointer to byteorder to use 658 0=native;-1=LE,1=BE; updated on 659 exit */ 660 ); 661 662/* Returns a Python string using the UTF-16 encoding in native byte 663 order. The string always starts with a BOM mark. */ 664 665extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String( 666 PyObject *unicode /* Unicode object */ 667 ); 668 669/* Returns a Python string object holding the UTF-16 encoded value of 670 the Unicode data. 671 672 If byteorder is not 0, output is written according to the following 673 byte order: 674 675 byteorder == -1: little endian 676 byteorder == 0: native byte order (writes a BOM mark) 677 byteorder == 1: big endian 678 679 If byteorder is 0, the output string will always start with the 680 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 681 prepended. 682 683 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 684 UCS-2. This trick makes it possible to add full UTF-16 capabilities 685 at a later point without compromising the APIs. 
686 687*/ 688 689extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16( 690 const Py_UNICODE *data, /* Unicode char buffer */ 691 int length, /* number of Py_UNICODE chars to encode */ 692 const char *errors, /* error handling */ 693 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 694 ); 695 696/* --- Unicode-Escape Codecs ---------------------------------------------- */ 697 698extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape( 699 const char *string, /* Unicode-Escape encoded string */ 700 int length, /* size of string */ 701 const char *errors /* error handling */ 702 ); 703 704extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString( 705 PyObject *unicode /* Unicode object */ 706 ); 707 708extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape( 709 const Py_UNICODE *data, /* Unicode char buffer */ 710 int length /* Number of Py_UNICODE chars to encode */ 711 ); 712 713/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 714 715extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 716 const char *string, /* Raw-Unicode-Escape encoded string */ 717 int length, /* size of string */ 718 const char *errors /* error handling */ 719 ); 720 721extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 722 PyObject *unicode /* Unicode object */ 723 ); 724 725extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 726 const Py_UNICODE *data, /* Unicode char buffer */ 727 int length /* Number of Py_UNICODE chars to encode */ 728 ); 729 730/* --- Latin-1 Codecs ----------------------------------------------------- 731 732 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 
733 734*/ 735 736extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1( 737 const char *string, /* Latin-1 encoded string */ 738 int length, /* size of string */ 739 const char *errors /* error handling */ 740 ); 741 742extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String( 743 PyObject *unicode /* Unicode object */ 744 ); 745 746extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1( 747 const Py_UNICODE *data, /* Unicode char buffer */ 748 int length, /* Number of Py_UNICODE chars to encode */ 749 const char *errors /* error handling */ 750 ); 751 752/* --- ASCII Codecs ------------------------------------------------------- 753 754 Only 7-bit ASCII data is excepted. All other codes generate errors. 755 756*/ 757 758extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII( 759 const char *string, /* ASCII encoded string */ 760 int length, /* size of string */ 761 const char *errors /* error handling */ 762 ); 763 764extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString( 765 PyObject *unicode /* Unicode object */ 766 ); 767 768extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII( 769 const Py_UNICODE *data, /* Unicode char buffer */ 770 int length, /* Number of Py_UNICODE chars to encode */ 771 const char *errors /* error handling */ 772 ); 773 774/* --- Character Map Codecs ----------------------------------------------- 775 776 This codec uses mappings to encode and decode characters. 777 778 Decoding mappings must map single string characters to single 779 Unicode characters, integers (which are then interpreted as Unicode 780 ordinals) or None (meaning "undefined mapping" and causing an 781 error). 782 783 Encoding mappings must map single Unicode characters to single 784 string characters, integers (which are then interpreted as Latin-1 785 ordinals) or None (meaning "undefined mapping" and causing an 786 error). 
787 788 If a character lookup fails with a LookupError, the character is 789 copied as-is meaning that its ordinal value will be interpreted as 790 Unicode or Latin-1 ordinal resp. Because of this mappings only need 791 to contain those mappings which map characters to different code 792 points. 793 794*/ 795 796extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap( 797 const char *string, /* Encoded string */ 798 int length, /* size of string */ 799 PyObject *mapping, /* character mapping 800 (char ordinal -> unicode ordinal) */ 801 const char *errors /* error handling */ 802 ); 803 804extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString( 805 PyObject *unicode, /* Unicode object */ 806 PyObject *mapping /* character mapping 807 (unicode ordinal -> char ordinal) */ 808 ); 809 810extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap( 811 const Py_UNICODE *data, /* Unicode char buffer */ 812 int length, /* Number of Py_UNICODE chars to encode */ 813 PyObject *mapping, /* character mapping 814 (unicode ordinal -> char ordinal) */ 815 const char *errors /* error handling */ 816 ); 817 818/* Translate a Py_UNICODE buffer of the given length by applying a 819 character mapping table to it and return the resulting Unicode 820 object. 821 822 The mapping table must map Unicode ordinal integers to Unicode 823 ordinal integers or None (causing deletion of the character). 824 825 Mapping tables may be dictionaries or sequences. Unmapped character 826 ordinals (ones which cause a LookupError) are left untouched and 827 are copied as-is. 
828 829*/ 830 831extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap( 832 const Py_UNICODE *data, /* Unicode char buffer */ 833 int length, /* Number of Py_UNICODE chars to translate */ 834 PyObject *table, /* Translate table */ 835 const char *errors /* error handling */ 836 ); 837 838#ifdef MS_WIN32 839 840/* --- MBCS codecs for Windows -------------------------------------------- */ 841 842extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS( 843 const char *string, /* MBCS encoded string */ 844 int length, /* size of string */ 845 const char *errors /* error handling */ 846 ); 847 848extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString( 849 PyObject *unicode /* Unicode object */ 850 ); 851 852extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS( 853 const Py_UNICODE *data, /* Unicode char buffer */ 854 int length, /* Number of Py_UNICODE chars to encode */ 855 const char *errors /* error handling */ 856 ); 857 858#endif /* MS_WIN32 */ 859 860/* --- Decimal Encoder ---------------------------------------------------- */ 861 862/* Takes a Unicode string holding a decimal value and writes it into 863 an output buffer using standard ASCII digit codes. 864 865 The output buffer has to provide at least length+1 bytes of storage 866 area. The output string is 0-terminated. 867 868 The encoder converts whitespace to ' ', decimal characters to their 869 corresponding ASCII digit and all other Latin-1 characters except 870 \0 as-is. Characters outside this range (Unicode ordinals 1-255) 871 are treated as errors. This includes embedded NUL characters. 872 873 Error handling is defined by the errors argument: 874 875 NULL or "strict": raise a ValueError 876 "ignore": ignore the wrong characters (these are not copied to the 877 output buffer) 878 "replace": replaces illegal characters with '?' 879 880 Returns 0 on success, -1 on failure. 
881 882*/ 883 884extern DL_IMPORT(int) PyUnicode_EncodeDecimal( 885 Py_UNICODE *s, /* Unicode buffer */ 886 int length, /* Number of Py_UNICODE chars to encode */ 887 char *output, /* Output buffer; must have size >= length+1 */ 888 const char *errors /* error handling */ 889 ); 890 891/* --- Methods & Slots ---------------------------------------------------- 892 893 These are capable of handling Unicode objects and strings on input 894 (we refer to them as strings in the descriptions) and return 895 Unicode objects or integers as appropriate. */ 896 897/* Concatenate two strings giving a new Unicode string. */ 898 899extern DL_IMPORT(PyObject*) PyUnicode_Concat( 900 PyObject *left, /* Left string */ 901 PyObject *right /* Right string */ 902 ); 903 904/* Split a string giving a list of Unicode strings. 905 906 If sep is NULL, splitting will be done at all whitespace 907 substrings. Otherwise, splits occur at the given separator. 908 909 At most maxsplit splits will be done. If negative, no limit is set. 910 911 Separators are not included in the resulting list. 912 913*/ 914 915extern DL_IMPORT(PyObject*) PyUnicode_Split( 916 PyObject *s, /* String to split */ 917 PyObject *sep, /* String separator */ 918 int maxsplit /* Maxsplit count */ 919 ); 920 921/* Ditto, but split at line breaks. 922 923 CRLF is considered to be one line break. Line breaks are not 924 included in the resulting list unless keepends is true. */ 925 926extern DL_IMPORT(PyObject*) PyUnicode_Splitlines( 927 PyObject *s, /* String to split */ 928 int keepends /* If true, line end markers are included */ 929 ); 930 931/* Translate a string by applying a character mapping table to it and 932 return the resulting Unicode object. 933 934 The mapping table must map Unicode ordinal integers to Unicode 935 ordinal integers or None (causing deletion of the character). 936 937 Mapping tables may be dictionaries or sequences. Unmapped character 938 ordinals (ones which cause a LookupError) are left untouched and 939 are copied as-is. 
940 941*/ 942 943extern DL_IMPORT(PyObject *) PyUnicode_Translate( 944 PyObject *str, /* String */ 945 PyObject *table, /* Translate table */ 946 const char *errors /* error handling */ 947 ); 948 949/* Join a sequence of strings using the given separator and return 950 the resulting Unicode string. */ 951 952extern DL_IMPORT(PyObject*) PyUnicode_Join( 953 PyObject *separator, /* Separator string */ 954 PyObject *seq /* Sequence object */ 955 ); 956 957/* Return 1 if substr matches str[start:end] at the tail end selected 958 by direction, 0 otherwise. */ 959 960extern DL_IMPORT(int) PyUnicode_Tailmatch( 961 PyObject *str, /* String */ 962 PyObject *substr, /* Prefix or Suffix string */ 963 int start, /* Start index */ 964 int end, /* Stop index */ 965 int direction /* Tail end: -1 prefix, +1 suffix */ 966 ); 967 968/* Return the first position of substr in str[start:end] using the 969 given search direction, or -1 if not found. */ 970 971extern DL_IMPORT(int) PyUnicode_Find( 972 PyObject *str, /* String */ 973 PyObject *substr, /* Substring to find */ 974 int start, /* Start index */ 975 int end, /* Stop index */ 976 int direction /* Find direction: +1 forward, -1 backward */ 977 ); 978 979/* Count the number of occurrences of substr in str[start:end]. */ 980 981extern DL_IMPORT(int) PyUnicode_Count( 982 PyObject *str, /* String */ 983 PyObject *substr, /* Substring to count */ 984 int start, /* Start index */ 985 int end /* Stop index */ 986 ); 987 988/* Replace at most maxcount occurrences of substr in str with replstr 989 and return the resulting Unicode object. */ 990 991extern DL_IMPORT(PyObject *) PyUnicode_Replace( 992 PyObject *str, /* String */ 993 PyObject *substr, /* Substring to find */ 994 PyObject *replstr, /* Replacement string */ 995 int maxcount /* Max. number of replacements to apply; 996 -1 = all */ 997 ); 998 999/* Compare two strings and return -1, 0, 1 for less than, equal, 1000 greater than resp. 
*/ 1001 1002extern DL_IMPORT(int) PyUnicode_Compare( 1003 PyObject *left, /* Left string */ 1004 PyObject *right /* Right string */ 1005 ); 1006 1007/* Apply an argument tuple or dictionary to a format string and return 1008 the resulting Unicode string. */ 1009 1010extern DL_IMPORT(PyObject *) PyUnicode_Format( 1011 PyObject *format, /* Format string */ 1012 PyObject *args /* Argument tuple or dictionary */ 1013 ); 1014 1015/* Checks whether element is contained in container and returns 1/0 1016 accordingly. 1017 1018 element has to coerce to a one-element Unicode string. -1 is 1019 returned in case of an error. */ 1020 1021extern DL_IMPORT(int) PyUnicode_Contains( 1022 PyObject *container, /* Container string */ 1023 PyObject *element /* Element string */ 1024 ); 1025 1026/* === Characters Type APIs =============================================== */ 1027 1028/* These should not be used directly. Use the Py_UNICODE_IS* and 1029 Py_UNICODE_TO* macros instead. 1030 1031 These APIs are implemented in Objects/unicodectype.c. 
1032 1033*/ 1034 1035extern DL_IMPORT(int) _PyUnicode_IsLowercase( 1036 Py_UNICODE ch /* Unicode character */ 1037 ); 1038 1039extern DL_IMPORT(int) _PyUnicode_IsUppercase( 1040 Py_UNICODE ch /* Unicode character */ 1041 ); 1042 1043extern DL_IMPORT(int) _PyUnicode_IsTitlecase( 1044 Py_UNICODE ch /* Unicode character */ 1045 ); 1046 1047extern DL_IMPORT(int) _PyUnicode_IsWhitespace( 1048 Py_UNICODE ch /* Unicode character */ 1049 ); 1050 1051extern DL_IMPORT(int) _PyUnicode_IsLinebreak( 1052 Py_UNICODE ch /* Unicode character */ 1053 ); 1054 1055extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase( 1056 Py_UNICODE ch /* Unicode character */ 1057 ); 1058 1059extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase( 1060 Py_UNICODE ch /* Unicode character */ 1061 ); 1062 1063extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase( 1064 Py_UNICODE ch /* Unicode character */ 1065 ); 1066 1067extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit( 1068 Py_UNICODE ch /* Unicode character */ 1069 ); 1070 1071extern DL_IMPORT(int) _PyUnicode_ToDigit( 1072 Py_UNICODE ch /* Unicode character */ 1073 ); 1074 1075extern DL_IMPORT(double) _PyUnicode_ToNumeric( 1076 Py_UNICODE ch /* Unicode character */ 1077 ); 1078 1079extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit( 1080 Py_UNICODE ch /* Unicode character */ 1081 ); 1082 1083extern DL_IMPORT(int) _PyUnicode_IsDigit( 1084 Py_UNICODE ch /* Unicode character */ 1085 ); 1086 1087extern DL_IMPORT(int) _PyUnicode_IsNumeric( 1088 Py_UNICODE ch /* Unicode character */ 1089 ); 1090 1091extern DL_IMPORT(int) _PyUnicode_IsAlpha( 1092 Py_UNICODE ch /* Unicode character */ 1093 ); 1094 /* Closing brace compiled only for C++; presumably ends an extern "C" block opened earlier in this header. */ 1095#ifdef __cplusplus 1096} 1097#endif 1098#endif /* Py_USING_UNICODE */ 1099#endif /* !Py_UNICODEOBJECT_H */ 1100