/* unicodeobject.h revision b8c65bc27ffc61c659180c351d3cc283abd1be45 */
1#ifndef Py_UNICODEOBJECT_H 2#define Py_UNICODEOBJECT_H 3 4/* 5 6Unicode implementation based on original code by Fredrik Lundh, 7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 8Unicode Integration Proposal (see file Misc/unicode.txt). 9 10Copyright (c) Corporation for National Research Initiatives. 11 12 13 Original header: 14 -------------------------------------------------------------------- 15 16 * Yet another Unicode string type for Python. This type supports the 17 * 16-bit Basic Multilingual Plane (BMP) only. 18 * 19 * Written by Fredrik Lundh, January 1999. 20 * 21 * Copyright (c) 1999 by Secret Labs AB. 22 * Copyright (c) 1999 by Fredrik Lundh. 23 * 24 * fredrik@pythonware.com 25 * http://www.pythonware.com 26 * 27 * -------------------------------------------------------------------- 28 * This Unicode String Type is 29 * 30 * Copyright (c) 1999 by Secret Labs AB 31 * Copyright (c) 1999 by Fredrik Lundh 32 * 33 * By obtaining, using, and/or copying this software and/or its 34 * associated documentation, you agree that you have read, understood, 35 * and will comply with the following terms and conditions: 36 * 37 * Permission to use, copy, modify, and distribute this software and its 38 * associated documentation for any purpose and without fee is hereby 39 * granted, provided that the above copyright notice appears in all 40 * copies, and that both that copyright notice and this permission notice 41 * appear in supporting documentation, and that the name of Secret Labs 42 * AB or the author not be used in advertising or publicity pertaining to 43 * distribution of the software without specific, written prior 44 * permission. 45 * 46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 48 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 53 * -------------------------------------------------------------------- */ 54 55#include <ctype.h> 56 57/* === Internal API ======================================================= */ 58 59/* --- Internal Unicode Format -------------------------------------------- */ 60 61#ifndef Py_USING_UNICODE 62 63#define PyUnicode_Check(op) 0 64#define PyUnicode_CheckExact(op) 0 65 66#else 67 68/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is 69 properly set, but the default rules below doesn't set it. I'll 70 sort this out some other day -- fredrik@pythonware.com */ 71 72#ifndef Py_UNICODE_SIZE 73#error Must define Py_UNICODE_SIZE 74#endif 75 76/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode 77 strings are stored as UCS-2 (with limited support for UTF-16) */ 78 79#if Py_UNICODE_SIZE >= 4 80#define Py_UNICODE_WIDE 81#endif 82 83/* Set these flags if the platform has "wchar.h", "wctype.h" and the 84 wchar_t type is a 16-bit unsigned type */ 85/* #define HAVE_WCHAR_H */ 86/* #define HAVE_USABLE_WCHAR_T */ 87 88/* Defaults for various platforms */ 89#ifndef PY_UNICODE_TYPE 90 91/* Windows has a usable wchar_t type (unless we're using UCS-4) */ 92# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2 93# define HAVE_USABLE_WCHAR_T 94# define PY_UNICODE_TYPE wchar_t 95# endif 96 97# if defined(Py_UNICODE_WIDE) 98# define PY_UNICODE_TYPE Py_UCS4 99# endif 100 101#endif 102 103/* If the compiler provides a wchar_t type we try to support it 104 through the interface functions PyUnicode_FromWideChar() and 105 PyUnicode_AsWideChar(). 
*/ 106 107#ifdef HAVE_USABLE_WCHAR_T 108# ifndef HAVE_WCHAR_H 109# define HAVE_WCHAR_H 110# endif 111#endif 112 113#ifdef HAVE_WCHAR_H 114/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 115# ifdef _HAVE_BSDI 116# include <time.h> 117# endif 118# include <wchar.h> 119#endif 120 121/* 122 * Use this typedef when you need to represent a UTF-16 surrogate pair 123 * as single unsigned integer. 124 */ 125#if SIZEOF_INT >= 4 126typedef unsigned int Py_UCS4; 127#elif SIZEOF_LONG >= 4 128typedef unsigned long Py_UCS4; 129#endif 130 131typedef PY_UNICODE_TYPE Py_UNICODE; 132 133/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ 134 135/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds 136 produce different external names and thus cause import errors in 137 case Python interpreters and extensions with mixed compiled in 138 Unicode width assumptions are combined. */ 139 140#ifndef Py_UNICODE_WIDE 141 142# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString 143# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString 144# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString 145# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String 146# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString 147# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String 148# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String 149# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode 150# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString 151# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar 152# define PyUnicode_Compare PyUnicodeUCS2_Compare 153# define PyUnicode_Concat PyUnicodeUCS2_Concat 154# define PyUnicode_Contains PyUnicodeUCS2_Contains 155# define PyUnicode_Count PyUnicodeUCS2_Count 156# define PyUnicode_Decode PyUnicodeUCS2_Decode 157# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII 158# define 
PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap 159# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 160# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape 161# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 162# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 163# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape 164# define PyUnicode_Encode PyUnicodeUCS2_Encode 165# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII 166# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap 167# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal 168# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 169# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape 170# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 171# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 172# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape 173# define PyUnicode_Find PyUnicodeUCS2_Find 174# define PyUnicode_Format PyUnicodeUCS2_Format 175# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject 176# define PyUnicode_FromObject PyUnicodeUCS2_FromObject 177# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode 178# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar 179# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding 180# define PyUnicode_GetMax PyUnicodeUCS2_GetMax 181# define PyUnicode_GetSize PyUnicodeUCS2_GetSize 182# define PyUnicode_Join PyUnicodeUCS2_Join 183# define PyUnicode_Replace PyUnicodeUCS2_Replace 184# define PyUnicode_Resize PyUnicodeUCS2_Resize 185# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding 186# define PyUnicode_Split PyUnicodeUCS2_Split 187# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines 188# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch 189# define PyUnicode_Translate PyUnicodeUCS2_Translate 190# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap 
191# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString 192# define _PyUnicode_Fini _PyUnicodeUCS2_Fini 193# define _PyUnicode_Init _PyUnicodeUCS2_Init 194# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha 195# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit 196# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit 197# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak 198# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase 199# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric 200# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase 201# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase 202# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace 203# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit 204# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit 205# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase 206# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric 207# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase 208# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase 209 210#else 211 212# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString 213# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString 214# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString 215# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String 216# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString 217# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String 218# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String 219# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode 220# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString 221# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar 222# define PyUnicode_Compare PyUnicodeUCS4_Compare 223# define PyUnicode_Concat PyUnicodeUCS4_Concat 224# define PyUnicode_Contains PyUnicodeUCS4_Contains 225# define PyUnicode_Count PyUnicodeUCS4_Count 226# define 
PyUnicode_Decode PyUnicodeUCS4_Decode 227# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII 228# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap 229# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 230# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape 231# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 232# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 233# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape 234# define PyUnicode_Encode PyUnicodeUCS4_Encode 235# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII 236# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap 237# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal 238# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 239# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape 240# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 241# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 242# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape 243# define PyUnicode_Find PyUnicodeUCS4_Find 244# define PyUnicode_Format PyUnicodeUCS4_Format 245# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject 246# define PyUnicode_FromObject PyUnicodeUCS4_FromObject 247# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode 248# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar 249# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding 250# define PyUnicode_GetMax PyUnicodeUCS4_GetMax 251# define PyUnicode_GetSize PyUnicodeUCS4_GetSize 252# define PyUnicode_Join PyUnicodeUCS4_Join 253# define PyUnicode_Replace PyUnicodeUCS4_Replace 254# define PyUnicode_Resize PyUnicodeUCS4_Resize 255# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding 256# define PyUnicode_Split PyUnicodeUCS4_Split 257# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines 258# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch 259# define 
PyUnicode_Translate PyUnicodeUCS4_Translate 260# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap 261# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString 262# define _PyUnicode_Fini _PyUnicodeUCS4_Fini 263# define _PyUnicode_Init _PyUnicodeUCS4_Init 264# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha 265# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit 266# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit 267# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak 268# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase 269# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric 270# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase 271# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase 272# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace 273# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit 274# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit 275# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase 276# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric 277# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase 278# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase 279 280 281#endif 282 283/* --- Internal Unicode Operations ---------------------------------------- */ 284 285/* If you want Python to use the compiler's wctype.h functions instead 286 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or 287 configure Python using --with-ctype-functions. This reduces the 288 interpreter's code size. 
*/ 289 290#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) 291 292#include <wctype.h> 293 294#define Py_UNICODE_ISSPACE(ch) iswspace(ch) 295 296#define Py_UNICODE_ISLOWER(ch) iswlower(ch) 297#define Py_UNICODE_ISUPPER(ch) iswupper(ch) 298#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 299#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 300 301#define Py_UNICODE_TOLOWER(ch) towlower(ch) 302#define Py_UNICODE_TOUPPER(ch) towupper(ch) 303#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 304 305#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 306#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 307#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 308 309#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 310#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 311#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 312 313#define Py_UNICODE_ISALPHA(ch) iswalpha(ch) 314 315#else 316 317#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch) 318 319#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 320#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 321#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 322#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 323 324#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 325#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 326#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 327 328#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 329#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 330#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 331 332#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 333#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 334#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 335 336#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 337 338#endif 339 340#define Py_UNICODE_ISALNUM(ch) \ 341 
(Py_UNICODE_ISALPHA(ch) || \ 342 Py_UNICODE_ISDECIMAL(ch) || \ 343 Py_UNICODE_ISDIGIT(ch) || \ 344 Py_UNICODE_ISNUMERIC(ch)) 345 346#define Py_UNICODE_COPY(target, source, length)\ 347 (memcpy((target), (source), (length)*sizeof(Py_UNICODE))) 348 349#define Py_UNICODE_FILL(target, value, length) do\ 350 {int i; for (i = 0; i < (length); i++) (target)[i] = (value);}\ 351 while (0) 352 353#define Py_UNICODE_MATCH(string, offset, substring)\ 354 ((*((string)->str + (offset)) == *((substring)->str)) &&\ 355 !memcmp((string)->str + (offset), (substring)->str,\ 356 (substring)->length*sizeof(Py_UNICODE))) 357 358#ifdef __cplusplus 359extern "C" { 360#endif 361 362/* --- Unicode Type ------------------------------------------------------- */ 363 364typedef struct { 365 PyObject_HEAD 366 int length; /* Length of raw Unicode data in buffer */ 367 Py_UNICODE *str; /* Raw Unicode buffer */ 368 long hash; /* Hash value; -1 if not set */ 369 PyObject *defenc; /* (Default) Encoded version as Python 370 string, or NULL; this is used for 371 implementing the buffer protocol */ 372} PyUnicodeObject; 373 374extern DL_IMPORT(PyTypeObject) PyUnicode_Type; 375 376#define PyUnicode_Check(op) PyObject_TypeCheck(op, &PyUnicode_Type) 377#define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type) 378 379/* Fast access macros */ 380#define PyUnicode_GET_SIZE(op) \ 381 (((PyUnicodeObject *)(op))->length) 382#define PyUnicode_GET_DATA_SIZE(op) \ 383 (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)) 384#define PyUnicode_AS_UNICODE(op) \ 385 (((PyUnicodeObject *)(op))->str) 386#define PyUnicode_AS_DATA(op) \ 387 ((const char *)((PyUnicodeObject *)(op))->str) 388 389/* --- Constants ---------------------------------------------------------- */ 390 391/* This Unicode character will be used as replacement character during 392 decoding if the errors argument is set to "replace". Note: the 393 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 394 Unicode 3.0. 
*/ 395 396#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD) 397 398/* === Public API ========================================================= */ 399 400/* --- Plain Py_UNICODE --------------------------------------------------- */ 401 402/* Create a Unicode Object from the Py_UNICODE buffer u of the given 403 size. 404 405 u may be NULL which causes the contents to be undefined. It is the 406 user's responsibility to fill in the needed data afterwards. Note 407 that modifying the Unicode object contents after construction is 408 only allowed if u was set to NULL. 409 410 The buffer is copied into the new object. */ 411 412extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode( 413 const Py_UNICODE *u, /* Unicode buffer */ 414 int size /* size of buffer */ 415 ); 416 417/* Return a read-only pointer to the Unicode object's internal 418 Py_UNICODE buffer. */ 419 420extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode( 421 PyObject *unicode /* Unicode object */ 422 ); 423 424/* Get the length of the Unicode object. */ 425 426extern DL_IMPORT(int) PyUnicode_GetSize( 427 PyObject *unicode /* Unicode object */ 428 ); 429 430/* Get the maximum ordinal for a Unicode character. */ 431extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void); 432 433/* Resize an already allocated Unicode object to the new size length. 434 435 *unicode is modified to point to the new (resized) object and 0 436 returned on success. 437 438 This API may only be called by the function which also called the 439 Unicode constructor. The refcount on the object must be 1. Otherwise, 440 an error is returned. 441 442 Error handling is implemented as follows: an exception is set, -1 443 is returned and *unicode left untouched. 444 445*/ 446 447extern DL_IMPORT(int) PyUnicode_Resize( 448 PyObject **unicode, /* Pointer to the Unicode object */ 449 int length /* New length */ 450 ); 451 452/* Coerce obj to an Unicode object and return a reference with 453 *incremented* refcount. 
454 455 Coercion is done in the following way: 456 457 1. String and other char buffer compatible objects are decoded 458 under the assumptions that they contain data using the current 459 default encoding. Decoding is done in "strict" mode. 460 461 2. All other objects (including Unicode objects) raise an 462 exception. 463 464 The API returns NULL in case of an error. The caller is responsible 465 for decref'ing the returned objects. 466 467*/ 468 469extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject( 470 register PyObject *obj, /* Object */ 471 const char *encoding, /* encoding */ 472 const char *errors /* error handling */ 473 ); 474 475/* Coerce obj to an Unicode object and return a reference with 476 *incremented* refcount. 477 478 Unicode objects are passed back as-is (subclasses are converted to 479 true Unicode objects), all other objects are delegated to 480 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in 481 using the default encoding as basis for decoding the object. 482 483 The API returns NULL in case of an error. The caller is responsible 484 for decref'ing the returned objects. 485 486*/ 487 488extern DL_IMPORT(PyObject*) PyUnicode_FromObject( 489 register PyObject *obj /* Object */ 490 ); 491 492/* --- wchar_t support for platforms which support it --------------------- */ 493 494#ifdef HAVE_WCHAR_H 495 496/* Create a Unicode Object from the whcar_t buffer w of the given 497 size. 498 499 The buffer is copied into the new object. */ 500 501extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar( 502 register const wchar_t *w, /* wchar_t buffer */ 503 int size /* size of buffer */ 504 ); 505 506/* Copies the Unicode Object contents into the whcar_t buffer w. At 507 most size wchar_t characters are copied. 508 509 Returns the number of wchar_t characters copied or -1 in case of an 510 error. 
*/ 511 512extern DL_IMPORT(int) PyUnicode_AsWideChar( 513 PyUnicodeObject *unicode, /* Unicode object */ 514 register wchar_t *w, /* wchar_t buffer */ 515 int size /* size of buffer */ 516 ); 517 518#endif 519 520/* === Builtin Codecs ===================================================== 521 522 Many of these APIs take two arguments encoding and errors. These 523 parameters encoding and errors have the same semantics as the ones 524 of the builtin unicode() API. 525 526 Setting encoding to NULL causes the default encoding to be used. 527 528 Error handling is set by errors which may also be set to NULL 529 meaning to use the default handling defined for the codec. Default 530 error handling for all builtin codecs is "strict" (ValueErrors are 531 raised). 532 533 The codecs all use a similar interface. Only deviation from the 534 generic ones are documented. 535 536*/ 537 538/* --- Manage the default encoding ---------------------------------------- */ 539 540/* Return a Python string holding the default encoded value of the 541 Unicode object. 542 543 The resulting string is cached in the Unicode object for subsequent 544 usage by this function. The cached version is needed to implement 545 the character buffer interface and will live (at least) as long as 546 the Unicode object itself. 547 548 The refcount of the string is *not* incremented. 549 550 *** Exported for internal use by the interpreter only !!! *** 551 552*/ 553 554extern DL_IMPORT(PyObject *) _PyUnicode_AsDefaultEncodedString( 555 PyObject *, const char *); 556 557/* Returns the currently active default encoding. 558 559 The default encoding is currently implemented as run-time settable 560 process global. This may change in future versions of the 561 interpreter to become a parameter which is managed on a per-thread 562 basis. 563 564 */ 565 566extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void); 567 568/* Sets the currently active default encoding. 
569 570 Returns 0 on success, -1 in case of an error. 571 572 */ 573 574extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding( 575 const char *encoding /* Encoding name in standard form */ 576 ); 577 578/* --- Generic Codecs ----------------------------------------------------- */ 579 580/* Create a Unicode object by decoding the encoded string s of the 581 given size. */ 582 583extern DL_IMPORT(PyObject*) PyUnicode_Decode( 584 const char *s, /* encoded string */ 585 int size, /* size of buffer */ 586 const char *encoding, /* encoding */ 587 const char *errors /* error handling */ 588 ); 589 590/* Encodes a Py_UNICODE buffer of the given size and returns a 591 Python string object. */ 592 593extern DL_IMPORT(PyObject*) PyUnicode_Encode( 594 const Py_UNICODE *s, /* Unicode char buffer */ 595 int size, /* number of Py_UNICODE chars to encode */ 596 const char *encoding, /* encoding */ 597 const char *errors /* error handling */ 598 ); 599 600/* Encodes a Unicode object and returns the result as Python string 601 object. 
*/ 602 603extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString( 604 PyObject *unicode, /* Unicode object */ 605 const char *encoding, /* encoding */ 606 const char *errors /* error handling */ 607 ); 608 609/* --- UTF-7 Codecs ------------------------------------------------------- */ 610 611extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF7( 612 const char *string, /* UTF-7 encoded string */ 613 int length, /* size of string */ 614 const char *errors /* error handling */ 615 ); 616 617extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF7( 618 const Py_UNICODE *data, /* Unicode char buffer */ 619 int length, /* number of Py_UNICODE chars to encode */ 620 int encodeSetO, /* force the encoder to encode characters in 621 Set O, as described in RFC2152 */ 622 int encodeWhiteSpace, /* force the encoder to encode space, tab, 623 carriage return and linefeed characters */ 624 const char *errors /* error handling */ 625 ); 626 627/* --- UTF-8 Codecs ------------------------------------------------------- */ 628 629extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8( 630 const char *string, /* UTF-8 encoded string */ 631 int length, /* size of string */ 632 const char *errors /* error handling */ 633 ); 634 635extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String( 636 PyObject *unicode /* Unicode object */ 637 ); 638 639extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( 640 const Py_UNICODE *data, /* Unicode char buffer */ 641 int length, /* number of Py_UNICODE chars to encode */ 642 const char *errors /* error handling */ 643 ); 644 645/* --- UTF-16 Codecs ------------------------------------------------------ */ 646 647/* Decodes length bytes from a UTF-16 encoded buffer string and returns 648 the corresponding Unicode object. 649 650 errors (if non-NULL) defines the error handling. It defaults 651 to "strict". 
652 653 If byteorder is non-NULL, the decoder starts decoding using the 654 given byte order: 655 656 *byteorder == -1: little endian 657 *byteorder == 0: native order 658 *byteorder == 1: big endian 659 660 In native mode, the first two bytes of the stream are checked for a 661 BOM mark. If found, the BOM mark is analysed, the byte order 662 adjusted and the BOM skipped. In the other modes, no BOM mark 663 interpretation is done. After completion, *byteorder is set to the 664 current byte order at the end of input data. 665 666 If byteorder is NULL, the codec starts in native order mode. 667 668*/ 669 670extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16( 671 const char *string, /* UTF-16 encoded string */ 672 int length, /* size of string */ 673 const char *errors, /* error handling */ 674 int *byteorder /* pointer to byteorder to use 675 0=native;-1=LE,1=BE; updated on 676 exit */ 677 ); 678 679/* Returns a Python string using the UTF-16 encoding in native byte 680 order. The string always starts with a BOM mark. */ 681 682extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String( 683 PyObject *unicode /* Unicode object */ 684 ); 685 686/* Returns a Python string object holding the UTF-16 encoded value of 687 the Unicode data. 688 689 If byteorder is not 0, output is written according to the following 690 byte order: 691 692 byteorder == -1: little endian 693 byteorder == 0: native byte order (writes a BOM mark) 694 byteorder == 1: big endian 695 696 If byteorder is 0, the output string will always start with the 697 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 698 prepended. 699 700 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 701 UCS-2. This trick makes it possible to add full UTF-16 capabilities 702 at a later point without compromising the APIs. 
703 704*/ 705 706extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16( 707 const Py_UNICODE *data, /* Unicode char buffer */ 708 int length, /* number of Py_UNICODE chars to encode */ 709 const char *errors, /* error handling */ 710 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 711 ); 712 713/* --- Unicode-Escape Codecs ---------------------------------------------- */ 714 715extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape( 716 const char *string, /* Unicode-Escape encoded string */ 717 int length, /* size of string */ 718 const char *errors /* error handling */ 719 ); 720 721extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString( 722 PyObject *unicode /* Unicode object */ 723 ); 724 725extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape( 726 const Py_UNICODE *data, /* Unicode char buffer */ 727 int length /* Number of Py_UNICODE chars to encode */ 728 ); 729 730/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 731 732extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 733 const char *string, /* Raw-Unicode-Escape encoded string */ 734 int length, /* size of string */ 735 const char *errors /* error handling */ 736 ); 737 738extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 739 PyObject *unicode /* Unicode object */ 740 ); 741 742extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 743 const Py_UNICODE *data, /* Unicode char buffer */ 744 int length /* Number of Py_UNICODE chars to encode */ 745 ); 746 747/* --- Latin-1 Codecs ----------------------------------------------------- 748 749 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 
750 751*/ 752 753extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1( 754 const char *string, /* Latin-1 encoded string */ 755 int length, /* size of string */ 756 const char *errors /* error handling */ 757 ); 758 759extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String( 760 PyObject *unicode /* Unicode object */ 761 ); 762 763extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1( 764 const Py_UNICODE *data, /* Unicode char buffer */ 765 int length, /* Number of Py_UNICODE chars to encode */ 766 const char *errors /* error handling */ 767 ); 768 769/* --- ASCII Codecs ------------------------------------------------------- 770 771 Only 7-bit ASCII data is excepted. All other codes generate errors. 772 773*/ 774 775extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII( 776 const char *string, /* ASCII encoded string */ 777 int length, /* size of string */ 778 const char *errors /* error handling */ 779 ); 780 781extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString( 782 PyObject *unicode /* Unicode object */ 783 ); 784 785extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII( 786 const Py_UNICODE *data, /* Unicode char buffer */ 787 int length, /* Number of Py_UNICODE chars to encode */ 788 const char *errors /* error handling */ 789 ); 790 791/* --- Character Map Codecs ----------------------------------------------- 792 793 This codec uses mappings to encode and decode characters. 794 795 Decoding mappings must map single string characters to single 796 Unicode characters, integers (which are then interpreted as Unicode 797 ordinals) or None (meaning "undefined mapping" and causing an 798 error). 799 800 Encoding mappings must map single Unicode characters to single 801 string characters, integers (which are then interpreted as Latin-1 802 ordinals) or None (meaning "undefined mapping" and causing an 803 error). 
804 805 If a character lookup fails with a LookupError, the character is 806 copied as-is meaning that its ordinal value will be interpreted as 807 Unicode or Latin-1 ordinal resp. Because of this mappings only need 808 to contain those mappings which map characters to different code 809 points. 810 811*/ 812 813extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap( 814 const char *string, /* Encoded string */ 815 int length, /* size of string */ 816 PyObject *mapping, /* character mapping 817 (char ordinal -> unicode ordinal) */ 818 const char *errors /* error handling */ 819 ); 820 821extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString( 822 PyObject *unicode, /* Unicode object */ 823 PyObject *mapping /* character mapping 824 (unicode ordinal -> char ordinal) */ 825 ); 826 827extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap( 828 const Py_UNICODE *data, /* Unicode char buffer */ 829 int length, /* Number of Py_UNICODE chars to encode */ 830 PyObject *mapping, /* character mapping 831 (unicode ordinal -> char ordinal) */ 832 const char *errors /* error handling */ 833 ); 834 835/* Translate a Py_UNICODE buffer of the given length by applying a 836 character mapping table to it and return the resulting Unicode 837 object. 838 839 The mapping table must map Unicode ordinal integers to Unicode 840 ordinal integers or None (causing deletion of the character). 841 842 Mapping tables may be dictionaries or sequences. Unmapped character 843 ordinals (ones which cause a LookupError) are left untouched and 844 are copied as-is. 
845 846*/ 847 848extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap( 849 const Py_UNICODE *data, /* Unicode char buffer */ 850 int length, /* Number of Py_UNICODE chars to encode */ 851 PyObject *table, /* Translate table */ 852 const char *errors /* error handling */ 853 ); 854 855#ifdef MS_WIN32 856 857/* --- MBCS codecs for Windows -------------------------------------------- */ 858 859extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS( 860 const char *string, /* MBCS encoded string */ 861 int length, /* size of string */ 862 const char *errors /* error handling */ 863 ); 864 865extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString( 866 PyObject *unicode /* Unicode object */ 867 ); 868 869extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS( 870 const Py_UNICODE *data, /* Unicode char buffer */ 871 int length, /* Number of Py_UNICODE chars to encode */ 872 const char *errors /* error handling */ 873 ); 874 875#endif /* MS_WIN32 */ 876 877/* --- Decimal Encoder ---------------------------------------------------- */ 878 879/* Takes a Unicode string holding a decimal value and writes it into 880 an output buffer using standard ASCII digit codes. 881 882 The output buffer has to provide at least length+1 bytes of storage 883 area. The output string is 0-terminated. 884 885 The encoder converts whitespace to ' ', decimal characters to their 886 corresponding ASCII digit and all other Latin-1 characters except 887 \0 as-is. Characters outside this range (Unicode ordinals 1-256) 888 are treated as errors. This includes embedded NULL bytes. 889 890 Error handling is defined by the errors argument: 891 892 NULL or "strict": raise a ValueError 893 "ignore": ignore the wrong characters (these are not copied to the 894 output buffer) 895 "replace": replaces illegal characters with '?' 896 897 Returns 0 on success, -1 on failure. 
898 899*/ 900 901extern DL_IMPORT(int) PyUnicode_EncodeDecimal( 902 Py_UNICODE *s, /* Unicode buffer */ 903 int length, /* Number of Py_UNICODE chars to encode */ 904 char *output, /* Output buffer; must have size >= length */ 905 const char *errors /* error handling */ 906 ); 907 908/* --- Methods & Slots ---------------------------------------------------- 909 910 These are capable of handling Unicode objects and strings on input 911 (we refer to them as strings in the descriptions) and return 912 Unicode objects or integers as apporpriate. */ 913 914/* Concat two strings giving a new Unicode string. */ 915 916extern DL_IMPORT(PyObject*) PyUnicode_Concat( 917 PyObject *left, /* Left string */ 918 PyObject *right /* Right string */ 919 ); 920 921/* Split a string giving a list of Unicode strings. 922 923 If sep is NULL, splitting will be done at all whitespace 924 substrings. Otherwise, splits occur at the given separator. 925 926 At most maxsplit splits will be done. If negative, no limit is set. 927 928 Separators are not included in the resulting list. 929 930*/ 931 932extern DL_IMPORT(PyObject*) PyUnicode_Split( 933 PyObject *s, /* String to split */ 934 PyObject *sep, /* String separator */ 935 int maxsplit /* Maxsplit count */ 936 ); 937 938/* Dito, but split at line breaks. 939 940 CRLF is considered to be one line break. Line breaks are not 941 included in the resulting list. */ 942 943extern DL_IMPORT(PyObject*) PyUnicode_Splitlines( 944 PyObject *s, /* String to split */ 945 int keepends /* If true, line end markers are included */ 946 ); 947 948/* Translate a string by applying a character mapping table to it and 949 return the resulting Unicode object. 950 951 The mapping table must map Unicode ordinal integers to Unicode 952 ordinal integers or None (causing deletion of the character). 953 954 Mapping tables may be dictionaries or sequences. Unmapped character 955 ordinals (ones which cause a LookupError) are left untouched and 956 are copied as-is. 
957 958*/ 959 960extern DL_IMPORT(PyObject *) PyUnicode_Translate( 961 PyObject *str, /* String */ 962 PyObject *table, /* Translate table */ 963 const char *errors /* error handling */ 964 ); 965 966/* Join a sequence of strings using the given separator and return 967 the resulting Unicode string. */ 968 969extern DL_IMPORT(PyObject*) PyUnicode_Join( 970 PyObject *separator, /* Separator string */ 971 PyObject *seq /* Sequence object */ 972 ); 973 974/* Return 1 if substr matches str[start:end] at the given tail end, 0 975 otherwise. */ 976 977extern DL_IMPORT(int) PyUnicode_Tailmatch( 978 PyObject *str, /* String */ 979 PyObject *substr, /* Prefix or Suffix string */ 980 int start, /* Start index */ 981 int end, /* Stop index */ 982 int direction /* Tail end: -1 prefix, +1 suffix */ 983 ); 984 985/* Return the first position of substr in str[start:end] using the 986 given search direction or -1 if not found. */ 987 988extern DL_IMPORT(int) PyUnicode_Find( 989 PyObject *str, /* String */ 990 PyObject *substr, /* Substring to find */ 991 int start, /* Start index */ 992 int end, /* Stop index */ 993 int direction /* Find direction: +1 forward, -1 backward */ 994 ); 995 996/* Count the number of occurrences of substr in str[start:end]. */ 997 998extern DL_IMPORT(int) PyUnicode_Count( 999 PyObject *str, /* String */ 1000 PyObject *substr, /* Substring to count */ 1001 int start, /* Start index */ 1002 int end /* Stop index */ 1003 ); 1004 1005/* Replace at most maxcount occurrences of substr in str with replstr 1006 and return the resulting Unicode object. */ 1007 1008extern DL_IMPORT(PyObject *) PyUnicode_Replace( 1009 PyObject *str, /* String */ 1010 PyObject *substr, /* Substring to find */ 1011 PyObject *replstr, /* Substring to replace */ 1012 int maxcount /* Max. number of replacements to apply; 1013 -1 = all */ 1014 ); 1015 1016/* Compare two strings and return -1, 0, 1 for less than, equal, 1017 greater than resp. 
*/ 1018 1019extern DL_IMPORT(int) PyUnicode_Compare( 1020 PyObject *left, /* Left string */ 1021 PyObject *right /* Right string */ 1022 ); 1023 1024/* Apply a argument tuple or dictionary to a format string and return 1025 the resulting Unicode string. */ 1026 1027extern DL_IMPORT(PyObject *) PyUnicode_Format( 1028 PyObject *format, /* Format string */ 1029 PyObject *args /* Argument tuple or dictionary */ 1030 ); 1031 1032/* Checks whether element is contained in container and return 1/0 1033 accordingly. 1034 1035 element has to coerce to an one element Unicode string. -1 is 1036 returned in case of an error. */ 1037 1038extern DL_IMPORT(int) PyUnicode_Contains( 1039 PyObject *container, /* Container string */ 1040 PyObject *element /* Element string */ 1041 ); 1042 1043/* === Characters Type APIs =============================================== */ 1044 1045/* These should not be used directly. Use the Py_UNICODE_IS* and 1046 Py_UNICODE_TO* macros instead. 1047 1048 These APIs are implemented in Objects/unicodectype.c. 
1049 1050*/ 1051 1052extern DL_IMPORT(int) _PyUnicode_IsLowercase( 1053 Py_UNICODE ch /* Unicode character */ 1054 ); 1055 1056extern DL_IMPORT(int) _PyUnicode_IsUppercase( 1057 Py_UNICODE ch /* Unicode character */ 1058 ); 1059 1060extern DL_IMPORT(int) _PyUnicode_IsTitlecase( 1061 Py_UNICODE ch /* Unicode character */ 1062 ); 1063 1064extern DL_IMPORT(int) _PyUnicode_IsWhitespace( 1065 Py_UNICODE ch /* Unicode character */ 1066 ); 1067 1068extern DL_IMPORT(int) _PyUnicode_IsLinebreak( 1069 Py_UNICODE ch /* Unicode character */ 1070 ); 1071 1072extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase( 1073 Py_UNICODE ch /* Unicode character */ 1074 ); 1075 1076extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase( 1077 Py_UNICODE ch /* Unicode character */ 1078 ); 1079 1080extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase( 1081 Py_UNICODE ch /* Unicode character */ 1082 ); 1083 1084extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit( 1085 Py_UNICODE ch /* Unicode character */ 1086 ); 1087 1088extern DL_IMPORT(int) _PyUnicode_ToDigit( 1089 Py_UNICODE ch /* Unicode character */ 1090 ); 1091 1092extern DL_IMPORT(double) _PyUnicode_ToNumeric( 1093 Py_UNICODE ch /* Unicode character */ 1094 ); 1095 1096extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit( 1097 Py_UNICODE ch /* Unicode character */ 1098 ); 1099 1100extern DL_IMPORT(int) _PyUnicode_IsDigit( 1101 Py_UNICODE ch /* Unicode character */ 1102 ); 1103 1104extern DL_IMPORT(int) _PyUnicode_IsNumeric( 1105 Py_UNICODE ch /* Unicode character */ 1106 ); 1107 1108extern DL_IMPORT(int) _PyUnicode_IsAlpha( 1109 Py_UNICODE ch /* Unicode character */ 1110 ); 1111 1112#ifdef __cplusplus 1113} 1114#endif 1115#endif /* Py_USING_UNICODE */ 1116#endif /* !Py_UNICODEOBJECT_H */ 1117