unicodeobject.h revision beb4135b8c81e1dbbb841ecd7355ab5a09a3edd2
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal (see file Misc/unicode.txt).
11
12Copyright (c) Corporation for National Research Initiatives.
13
14
15 Original header:
16 --------------------------------------------------------------------
17
18 * Yet another Unicode string type for Python.  This type supports the
19 * 16-bit Basic Multilingual Plane (BMP) only.
20 *
21 * Written by Fredrik Lundh, January 1999.
22 *
23 * Copyright (c) 1999 by Secret Labs AB.
24 * Copyright (c) 1999 by Fredrik Lundh.
25 *
26 * fredrik@pythonware.com
27 * http://www.pythonware.com
28 *
29 * --------------------------------------------------------------------
30 * This Unicode String Type is
31 *
32 * Copyright (c) 1999 by Secret Labs AB
33 * Copyright (c) 1999 by Fredrik Lundh
34 *
35 * By obtaining, using, and/or copying this software and/or its
36 * associated documentation, you agree that you have read, understood,
37 * and will comply with the following terms and conditions:
38 *
39 * Permission to use, copy, modify, and distribute this software and its
40 * associated documentation for any purpose and without fee is hereby
41 * granted, provided that the above copyright notice appears in all
42 * copies, and that both that copyright notice and this permission notice
43 * appear in supporting documentation, and that the name of Secret Labs
44 * AB or the author not be used in advertising or publicity pertaining to
45 * distribution of the software without specific, written prior
46 * permission.
47 *
48 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
50 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
55 * -------------------------------------------------------------------- */
56
57#include <ctype.h>
58
59/* === Internal API ======================================================= */
60
61/* --- Internal Unicode Format -------------------------------------------- */
62
63/* Python 3.x requires unicode: this feature macro is therefore defined
   unconditionally rather than being a build option. */
64#define Py_USING_UNICODE
65
66/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
67   properly set, but the default rules below don't set it.  I'll
68   sort this out some other day -- fredrik@pythonware.com */
69
/* Py_UNICODE_SIZE must be supplied before this point (nothing in the
   visible part of this header defines it).  Stop the build with a clear
   message instead of guessing a code-unit width. */
70#ifndef Py_UNICODE_SIZE
71#error Must define Py_UNICODE_SIZE
72#endif
73
74/* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
75   strings are stored as UCS-2 (with limited support for UTF-16).

   The flag is derived from Py_UNICODE_SIZE: any value of 4 or more
   switches it on. */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
86/* Defaults for various platforms.  They are only applied when
   PY_UNICODE_TYPE has not already been defined before this point. */
87#ifndef PY_UNICODE_TYPE
88
89/* Windows has a usable wchar_t type (unless we're using UCS-4) */
90# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
91#  define HAVE_USABLE_WCHAR_T
92#  define PY_UNICODE_TYPE wchar_t
93# endif
94
/* Wide builds store code units as Py_UCS4.  Py_UCS4 is typedef'ed only
   further down; that is fine because this macro is not expanded until the
   Py_UNICODE typedef, which comes after the Py_UCS4 typedef. */
95# if defined(Py_UNICODE_WIDE)
96#  define PY_UNICODE_TYPE Py_UCS4
97# endif
98
/* NOTE(review): a narrow build without MS_WIN32 matches neither branch
   above, so PY_UNICODE_TYPE has to be defined before this header is
   processed or the Py_UNICODE typedef below cannot compile. */
99#endif
100
101/* If the compiler provides a wchar_t type we try to support it
102   through the interface functions PyUnicode_FromWideChar(),
103   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString().

   HAVE_USABLE_WCHAR_T implies HAVE_WCHAR_H: the latter is forced on here
   so that <wchar.h> gets included and the wchar_t APIs get declared. */
104
105#ifdef HAVE_USABLE_WCHAR_T
106# ifndef HAVE_WCHAR_H
107#  define HAVE_WCHAR_H
108# endif
109#endif
110
111#ifdef HAVE_WCHAR_H
112/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
113# ifdef _HAVE_BSDI
114#  include <time.h>
115# endif
116#  include <wchar.h>
117#endif
118
119/*
120 * Use this typedef when you need to represent a UTF-16 surrogate pair
121 * as a single unsigned integer.
 *
 * `unsigned int` is used when SIZEOF_INT >= 4, otherwise `unsigned long`
 * when SIZEOF_LONG >= 4.
 *
 * NOTE(review): there is no #else branch.  If neither test succeeds,
 * Py_UCS4 is silently left undeclared and the failure only shows up at
 * its first use; an explicit #error here would give a clearer message.
122 */
123#if SIZEOF_INT >= 4
124typedef unsigned int Py_UCS4;
125#elif SIZEOF_LONG >= 4
126typedef unsigned long Py_UCS4;
127#endif
128
129/* Py_UNICODE is the native Unicode storage format (code unit) used by
130   Python and represents a single Unicode element in the Unicode
131   type.

   Its underlying type is whatever PY_UNICODE_TYPE expands to: wchar_t or
   Py_UCS4 when the defaults above apply, or a type chosen before this
   header was processed. */
132
133typedef PY_UNICODE_TYPE Py_UNICODE;
134
135/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
136
137/* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
138   produce different external names and thus cause import errors in
139   case Python interpreters and extensions that were compiled with
140   different Unicode width assumptions are combined.

   The two tables below are meant to be line-for-line parallel: the first
   maps each name to a PyUnicodeUCS2_* symbol (narrow build), the second to
   the matching PyUnicodeUCS4_* symbol (Py_UNICODE_WIDE build).

   Not every function declared in this header is listed; for example the
   UTF-7 codec functions have no entry and keep their plain names.

   NOTE(review): PyUnicode_CompareWithASCII is the one entry whose suffix
   differs between the tables (UCS2: ..._CompareASCII, UCS4:
   ..._CompareWithASCII).  This is harmless for linking because every user
   goes through the macro, but renaming it would change an exported
   symbol, so it is only flagged here. */
141
142#ifndef Py_UNICODE_WIDE
143
144# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
145# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
146# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject
147# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode
148# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
149# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
150# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode
151# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
152# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
153# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
154# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
155# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
156# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
157# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
158# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
159# define PyUnicode_AsWideCharString PyUnicodeUCS2_AsWideCharString
160# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist
161# define PyUnicode_Compare PyUnicodeUCS2_Compare
/* Suffix differs from the UCS4 table (CompareASCII vs CompareWithASCII). */
162# define PyUnicode_CompareWithASCII PyUnicodeUCS2_CompareASCII
163# define PyUnicode_Concat PyUnicodeUCS2_Concat
164# define PyUnicode_Append PyUnicodeUCS2_Append
165# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
166# define PyUnicode_Contains PyUnicodeUCS2_Contains
167# define PyUnicode_Count PyUnicodeUCS2_Count
168# define PyUnicode_Decode PyUnicodeUCS2_Decode
169# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
170# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
171# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
172# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
173# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize
174# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
175# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
176# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
177# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
178# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
179# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
180# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
181# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
182# define PyUnicode_Encode PyUnicodeUCS2_Encode
183# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
184# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
185# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
186# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
187# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
188# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
189# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
190# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
191# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
192# define PyUnicode_Find PyUnicodeUCS2_Find
193# define PyUnicode_Format PyUnicodeUCS2_Format
194# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
195# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
196# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
197# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
198# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
199# define PyUnicode_FromString PyUnicodeUCS2_FromString
200# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
201# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
202# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
203# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter
204# define PyUnicode_FSDecoder PyUnicodeUCS2_FSDecoder
205# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
206# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
207# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
208# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
209# define PyUnicode_Join PyUnicodeUCS2_Join
210# define PyUnicode_Partition PyUnicodeUCS2_Partition
211# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
212# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
213# define PyUnicode_Replace PyUnicodeUCS2_Replace
214# define PyUnicode_Resize PyUnicodeUCS2_Resize
215# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
216# define PyUnicode_Split PyUnicodeUCS2_Split
217# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
218# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
219# define PyUnicode_Translate PyUnicodeUCS2_Translate
220# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
221# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
222# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
223# define _PyUnicode_Init _PyUnicodeUCS2_Init
224# define PyUnicode_strdup PyUnicodeUCS2_strdup
225
226#else
227
228# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
229# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
230# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject
231# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode
232# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
233# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
234# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode
235# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
236# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
237# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
238# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
239# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
240# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
241# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
242# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
243# define PyUnicode_AsWideCharString PyUnicodeUCS4_AsWideCharString
244# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist
245# define PyUnicode_Compare PyUnicodeUCS4_Compare
246# define PyUnicode_CompareWithASCII PyUnicodeUCS4_CompareWithASCII
247# define PyUnicode_Concat PyUnicodeUCS4_Concat
248# define PyUnicode_Append PyUnicodeUCS4_Append
249# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
250# define PyUnicode_Contains PyUnicodeUCS4_Contains
251# define PyUnicode_Count PyUnicodeUCS4_Count
252# define PyUnicode_Decode PyUnicodeUCS4_Decode
253# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
254# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
255# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
256# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
257# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize
258# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
259# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
260# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
261# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
262# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
263# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
264# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
265# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
266# define PyUnicode_Encode PyUnicodeUCS4_Encode
267# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
268# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
269# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
270# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
271# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
272# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
273# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
274# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
275# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
276# define PyUnicode_Find PyUnicodeUCS4_Find
277# define PyUnicode_Format PyUnicodeUCS4_Format
278# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
279# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
280# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
281# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
282# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
283# define PyUnicode_FromString PyUnicodeUCS4_FromString
284# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
285# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
286# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
287# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter
288# define PyUnicode_FSDecoder PyUnicodeUCS4_FSDecoder
289# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
290# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
291# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
292# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
293# define PyUnicode_Join PyUnicodeUCS4_Join
294# define PyUnicode_Partition PyUnicodeUCS4_Partition
295# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
296# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
297# define PyUnicode_Replace PyUnicodeUCS4_Replace
298# define PyUnicode_Resize PyUnicodeUCS4_Resize
299# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
300# define PyUnicode_Split PyUnicodeUCS4_Split
301# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
302# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
303# define PyUnicode_Translate PyUnicodeUCS4_Translate
304# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
305# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
306# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
307# define _PyUnicode_Init _PyUnicodeUCS4_Init
308# define PyUnicode_strdup PyUnicodeUCS4_strdup
309
310#endif
311
312/* --- Internal Unicode Operations ---------------------------------------- */
313
314/* Since splitting on whitespace is an important use case, and
315   whitespace in most situations is solely ASCII whitespace, we
316   optimize for the common case by using a quick look-up table
317   _Py_ascii_whitespace (see below) with an inlined check.
318
   Values below 128 are answered directly from the table; everything
   else is delegated to _PyUnicode_IsWhitespace().

   `ch` is evaluated twice (once in the range test, once in the selected
   branch), so do not pass an expression with side effects.
319 */
320#define Py_UNICODE_ISSPACE(ch) \
321    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
322
/* Thin aliases: each macro forwards its single argument, unchanged and
   exactly once, to the _PyUnicode_* function named on the right. */

/* Case and line-break predicates. */
323#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
324#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
325#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
326#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
327
/* Case conversions. */
328#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
329#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
330#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
331
/* Numeric-class and printability predicates. */
332#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
333#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
334#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
335#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
336
/* Numeric value lookups. */
337#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
338#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
339#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
340
/* Alphabetic predicate. */
341#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
342
/* True when `ch` is alphabetic, a decimal digit, a digit or numeric, i.e.
   the logical OR of the four predicates below, tested in that order with
   short-circuit evaluation.

   `ch` appears four times in the expansion and may be evaluated up to four
   times, so do not pass an expression with side effects. */
343#define Py_UNICODE_ISALNUM(ch) \
344       (Py_UNICODE_ISALPHA(ch) || \
345    Py_UNICODE_ISDECIMAL(ch) || \
346    Py_UNICODE_ISDIGIT(ch) || \
347    Py_UNICODE_ISNUMERIC(ch))
348
/* Copy `length` Py_UNICODE code units from `source` to `target`.

   `length` is a count of code units, not bytes: it is multiplied by
   sizeof(Py_UNICODE) before being handed to Py_MEMCPY.
   NOTE(review): Py_MEMCPY is defined outside this header; if it has
   memcpy() semantics the two regions must not overlap -- confirm. */
349#define Py_UNICODE_COPY(target, source, length)                         \
350    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
351
/* Store `value` into the first `length` elements of the Py_UNICODE buffer
   `target`.

   target  -- pointer to the first element to overwrite
   value   -- code unit to store (converted to Py_UNICODE)
   length  -- number of elements to fill; zero or negative fills nothing

   Every argument is evaluated exactly once, in the order target, value,
   length, so expressions with side effects are safe to pass.  (Previously
   `length` was re-evaluated on each loop iteration.)  The expansion is a
   single do/while(0) statement, so it can be used as an unbraced if/else
   body. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
356
357/* Check if substring matches at given offset.  The offset must be
358   valid, and the substring must not be empty.

   `string` and `substring` are pointers to objects with `str` and
   `length` members (e.g. PyUnicodeObject *).  The test runs in three
   steps, cheapest first: compare the first code unit, compare the last
   code unit, and only then memcmp() the whole substring.  An empty
   substring would make the last-unit test read str[-1], hence the
   precondition above.

   All three arguments are expanded several times; do not pass
   expressions with side effects.
   NOTE(review): memcmp() needs <string.h>, which none of the includes
   visible in this header provide -- presumably the including file does. */
359
360#define Py_UNICODE_MATCH(string, offset, substring) \
361    ((*((string)->str + (offset)) == *((substring)->str)) && \
362    ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
363     !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
364
365#ifdef __cplusplus
366extern "C" {
367#endif
368
369/* --- Unicode Type ------------------------------------------------------- */
370
/* In-memory layout of a Unicode object.  The fast-access macros further
   down (PyUnicode_GET_SIZE, PyUnicode_AS_UNICODE, ...) read `length` and
   `str` directly after casting to this struct. */
371typedef struct {
372    PyObject_HEAD
373    Py_ssize_t length;          /* Length of raw Unicode data in buffer,
                                   counted in Py_UNICODE code units (not
                                   bytes) */
374    Py_UNICODE *str;            /* Raw Unicode buffer */
375    long hash;                  /* Hash value; -1 if not set */
376    int state;                  /* != 0 if interned. In this case the two
377                                 * references from the dictionary to this object
378                                 * are *not* counted in ob_refcnt.
                                 * Presumably one of the SSTATE_* values
                                 * defined below this struct. */
379    PyObject *defenc;           /* (Default) Encoded version as Python
380                                   string, or NULL; this is used for
381                                   implementing the buffer protocol */
382} PyUnicodeObject;
383
/* Type objects exported by the Unicode implementation.  PyUnicode_Type is
   the one PyUnicode_CheckExact() compares against; PyUnicodeIter_Type is
   presumably the type of the string iterator. */
384PyAPI_DATA(PyTypeObject) PyUnicode_Type;
385PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
386
/* Interning states.  Presumably these are the values stored in
   PyUnicodeObject.state: only the "not interned" state is 0, which
   matches that field's "!= 0 if interned" description. */
387#define SSTATE_NOT_INTERNED 0
388#define SSTATE_INTERNED_MORTAL 1
389#define SSTATE_INTERNED_IMMORTAL 2
390
/* PyUnicode_Check(op): true when the type of `op` has the
   Py_TPFLAGS_UNICODE_SUBCLASS flag, i.e. `op` is a Unicode object or an
   instance of a subclass.
   PyUnicode_CheckExact(op): true only when the type of `op` is exactly
   PyUnicode_Type (subclasses excluded).
   Neither macro tests `op` for NULL before passing it to Py_TYPE(). */
391#define PyUnicode_Check(op) \
392                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
393#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
394
395/* Fast access macros.

   Each one casts `op` to PyUnicodeObject * and reads a field directly.
   The only type check is an assert(PyUnicode_Check(op)), which vanishes
   when assertions are compiled out, so callers must already know that
   `op` is a Unicode object.  With assertions enabled `op` is evaluated
   twice.

   PyUnicode_GET_SIZE       length in Py_UNICODE code units
   PyUnicode_GET_DATA_SIZE  length in bytes (length * sizeof(Py_UNICODE))
   PyUnicode_AS_UNICODE     the raw buffer as Py_UNICODE *
   PyUnicode_AS_DATA        the same buffer viewed as const char * */
396#define PyUnicode_GET_SIZE(op) \
397    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length))
398#define PyUnicode_GET_DATA_SIZE(op) \
399    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))
400#define PyUnicode_AS_UNICODE(op) \
401    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str))
402#define PyUnicode_AS_DATA(op) \
403    (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str))
404
405/* --- Constants ---------------------------------------------------------- */
406
407/* This Unicode character will be used as replacement character during
408   decoding if the errors argument is set to "replace". Note: the
409   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
410   Unicode 3.0.

   The constant is cast to Py_UNICODE so it has the storage type of the
   current build. */
411
412#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
413
414/* === Public API ========================================================= */
415
416/* --- Plain Py_UNICODE --------------------------------------------------- */
417
418/* Create a Unicode Object from the Py_UNICODE buffer u of the given
419   size.
420
421   u may be NULL which causes the contents to be undefined. It is the
422   user's responsibility to fill in the needed data afterwards. Note
423   that modifying the Unicode object contents after construction is
424   only allowed if u was set to NULL.
425
426   The buffer is copied into the new object, so the caller keeps
   ownership of u.

   NOTE(review): size is presumably a count of Py_UNICODE code units
   rather than bytes, and failure is presumably reported as NULL as for
   the other constructors in this header -- confirm in the implementation. */
427
428PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
429    const Py_UNICODE *u,        /* Unicode buffer */
430    Py_ssize_t size             /* size of buffer */
431    );
432
433/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes.
   The buffer is a char buffer, so size is a number of bytes here. */
434PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
435    const char *u,        /* char buffer */
436    Py_ssize_t size       /* size of buffer */
437    );
438
439/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
440   UTF-8 encoded bytes.  No size is passed; the terminating null byte
   marks the end of the input. */
441PyAPI_FUNC(PyObject*) PyUnicode_FromString(
442    const char *u        /* string */
443    );
444
445/* Return a read-only pointer to the Unicode object's internal
446   Py_UNICODE buffer.

   The pointer refers to the object's own storage (nothing is copied), so
   it must not be freed by the caller.
   NOTE(review): the return type is a non-const Py_UNICODE * although the
   buffer is documented as read-only; the compiler will not stop writes. */
447
448PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
449    PyObject *unicode           /* Unicode object */
450    );
451
452/* Get the length of the Unicode object.

   Function counterpart of the PyUnicode_GET_SIZE() macro.
   NOTE(review): the error return for a non-Unicode argument is not
   documented here (presumably -1 with an exception set). */
453
454PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
455    PyObject *unicode           /* Unicode object */
456    );
457
458/* Get the maximum ordinal for a Unicode character.  The value is returned
   as a Py_UNICODE, i.e. in the storage type of the current build. */
459PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
460
461/* Resize an already allocated Unicode object to the new size length.
462
463   *unicode is modified to point to the new (resized) object and 0
464   returned on success.  The caller must therefore use *unicode after
   the call, not a previously saved copy of the pointer.
465
466   This API may only be called by the function which also called the
467   Unicode constructor. The refcount on the object must be 1. Otherwise,
468   an error is returned.
469
470   Error handling is implemented as follows: an exception is set, -1
471   is returned and *unicode left untouched.
472
473*/
474
475PyAPI_FUNC(int) PyUnicode_Resize(
476    PyObject **unicode,         /* Pointer to the Unicode object */
477    Py_ssize_t length           /* New length */
478    );
479
480/* Coerce obj to a Unicode object and return a reference with
481   *incremented* refcount.
482
483   Coercion is done in the following way:
484
485   1. bytes, bytearray and other char buffer compatible objects are decoded
486      under the assumptions that they contain data using the current
487      default encoding. Decoding is done in "strict" mode.
488
489   2. All other objects (including Unicode objects) raise an
490      exception.
491
492   The API returns NULL in case of an error. The caller is responsible
493   for decref'ing the returned objects.
494
   NOTE(review): item 1 speaks of the default encoding and "strict" mode,
   yet the function takes explicit encoding and errors arguments.
   Presumably the description applies when they are passed as NULL --
   confirm against the implementation.

495*/
496
497PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
498    register PyObject *obj,     /* Object */
499    const char *encoding,       /* encoding */
500    const char *errors          /* error handling */
501    );
502
503/* Coerce obj to a Unicode object and return a reference with
504   *incremented* refcount.
505
506   Unicode objects are passed back as-is (subclasses are converted to
507   true Unicode objects), all other objects are delegated to
508   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
509   using UTF-8 encoding as basis for decoding the object.
510
511   The API returns NULL in case of an error. The caller is responsible
512   for decref'ing the returned objects.
513
514*/
515
516PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
517    register PyObject *obj      /* Object */
518    );
519
/* Build a Unicode object from a format string and arguments.

   PyUnicode_FromFormat() takes the arguments as C varargs;
   PyUnicode_FromFormatV() takes the same arguments as a va_list.  The
   format string must be ASCII-encoded.
   NOTE(review): the supported conversion specifiers are not documented in
   this header (presumably printf-like); see the implementation. */
520PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
521    const char *format,   /* ASCII-encoded string  */
522    va_list vargs
523    );
524PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
525    const char *format,   /* ASCII-encoded string  */
526    ...
527    );
528
529/* Format the object based on the format_spec, as defined in PEP 3101
530   (Advanced String Formatting).

   format_spec is passed as a Py_UNICODE buffer together with its length
   format_spec_len, so it does not have to be null-terminated.  The leading
   underscore marks this as a private API. */
531PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
532                                                 Py_UNICODE *format_spec,
533                                                 Py_ssize_t format_spec_len);
534
/* String interning.

   NOTE(review): none of these are documented in this header; the notes
   below are inferred from the signatures and should be confirmed.
   - InternInPlace / InternImmortal take a PyObject ** and return void, so
     presumably they may replace *p with the interned object and cannot
     report failure through a return value.
   - InternFromString presumably builds a Unicode object from a C string
     and interns it.
   - _Py_ReleaseInternedUnicodeStrings is private (leading underscore). */
535PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
536PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
537PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
538PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
539
540/* Use only if you know it's a string: op is cast to PyUnicodeObject *
   without any type check.  Yields the raw `state` field, which is non-zero
   when the string is interned. */
541#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
542
543/* --- wchar_t support for platforms which support it --------------------- */
544
/* Everything in this section is only declared when HAVE_WCHAR_H is set. */
545#ifdef HAVE_WCHAR_H
546
547/* Create a Unicode Object from the wchar_t buffer w of the given
548   size.
549
550   The buffer is copied into the new object. */
551
552PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
553    register const wchar_t *w,  /* wchar_t buffer */
554    Py_ssize_t size             /* size of buffer */
555    );
556
557/* Copies the Unicode Object contents into the wchar_t buffer w.  At
558   most size wchar_t characters are copied.
559
560   Note that the resulting wchar_t string may or may not be
561   0-terminated.  It is the responsibility of the caller to make sure
562   that the wchar_t string is 0-terminated in case this is required by
563   the application.
564
565   Returns the number of wchar_t characters copied (excluding a
566   possibly trailing 0-termination character) or -1 in case of an
567   error.

   Unlike PyUnicode_AsWideCharString() below, the object is passed as a
   PyUnicodeObject *, not a PyObject *. */
568
569PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
570    PyUnicodeObject *unicode,   /* Unicode object */
571    register wchar_t *w,        /* wchar_t buffer */
572    Py_ssize_t size             /* size of buffer */
573    );
574
575/* Convert the Unicode object to a wide character string. The output string
576   always ends with a nul character. If size is not NULL, write the number of
577   wide characters (including the nul character) into *size.
578
579   Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it)
580   on success. On error, returns NULL, *size is undefined and raises a
581   MemoryError.

   NOTE(review): "PyMem_Alloc()" presumably means PyMem_Malloc() -- confirm.
   Either way the caller owns the returned buffer and must release it with
   PyMem_Free(). */
582
583PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
584    PyObject *unicode,          /* Unicode object */
585    Py_ssize_t *size            /* number of characters of the result */
586    );
587
588#endif
589
590/* --- Unicode ordinals --------------------------------------------------- */
591
592/* Create a Unicode Object from the given Unicode code point ordinal.
593
594   The ordinal must be in range(0x10000) on narrow Python builds
595   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
596   raised in case it is not.
597
   The ordinal is a signed int, so negative values are possible inputs;
   they fall outside both ranges.

598*/
599
600PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
601
602/* --- Free-list management ----------------------------------------------- */
603
604/* Clear the free list used by the Unicode implementation.
605
606   This can be used to release memory used for objects on the free
607   list back to the Python memory allocator.
608
   NOTE(review): the meaning of the int return value is not documented
   here (presumably the number of entries that were freed) -- confirm.

609*/
610
611PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
612
613/* === Builtin Codecs =====================================================
614
615   Many of these APIs take two arguments encoding and errors. These
616   parameters encoding and errors have the same semantics as the ones
617   of the builtin str() API.
618
619   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
620
621   Error handling is set by errors which may also be set to NULL
622   meaning to use the default handling defined for the codec. Default
623   error handling for all builtin codecs is "strict" (ValueErrors are
624   raised).
625
626   The codecs all use a similar interface. Only deviations from the
627   generic ones are documented.
628
629*/
630
631/* --- Manage the default encoding ---------------------------------------- */
632
633/* Return a Python string holding the default encoded value of the
634   Unicode object.
635
636   The resulting string is cached in the Unicode object for subsequent
637   usage by this function. The cached version is needed to implement
638   the character buffer interface and will live (at least) as long as
639   the Unicode object itself.
640
641   The refcount of the string is *not* incremented, i.e. the result is a
   borrowed reference that the caller must not decref.
642
643   *** Exported for internal use by the interpreter only !!! ***
644
645*/
646
647PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
648    PyObject *unicode,
649    const char *errors);
650
651/* Returns a pointer to the default encoding (normally, UTF-8) of the
652   Unicode object unicode and the size of the encoded representation
653   in bytes stored in *size.
654
655   In case of an error, no *size is set.
656
657   *** This API is for interpreter INTERNAL USE ONLY and will likely
658   *** be removed or changed for Python 3.1.
659
660   *** If you need to access the Unicode object as UTF-8 bytes string,
661   *** please use PyUnicode_AsUTF8String() instead.
662
   NOTE(review): the value returned on error is not stated (presumably
   NULL).  The pointer presumably refers to the cached encoded string
   described for _PyUnicode_AsDefaultEncodedString(); if so the caller
   must not free it -- confirm.

663*/
664
665PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize(
666    PyObject *unicode,
667    Py_ssize_t *size);
668
669/* Returns a pointer to the default encoding (normally, UTF-8) of the
670   Unicode object unicode.
671
672   Use of this API is DEPRECATED since no size information can be
673   extracted from the returned data (embedded null bytes cannot be told
   apart from the end of the string).
674
675   *** This API is for interpreter INTERNAL USE ONLY and will likely
676   *** be removed or changed for Python 3.1.
677
678   *** If you need to access the Unicode object as UTF-8 bytes string,
679   *** please use PyUnicode_AsUTF8String() instead.
680
681*/
682
683PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode);
684
685/* Returns the currently active default encoding.
686
687   The default encoding is currently implemented as run-time settable
688   process global.  This may change in future versions of the
689   interpreter to become a parameter which is managed on a per-thread
690   basis.
691
   The result is a const char *: callers must not modify it.

692 */
693
694PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
695
696/* --- Generic Codecs ----------------------------------------------------- */
697
698/* Create a Unicode object by decoding the encoded string s of the
699   given size.

   encoding and errors follow the conventions described in the
   "Builtin Codecs" overview of this header (NULL selects the default). */
700
701PyAPI_FUNC(PyObject*) PyUnicode_Decode(
702    const char *s,              /* encoded string */
703    Py_ssize_t size,            /* size of buffer */
704    const char *encoding,       /* encoding */
705    const char *errors          /* error handling */
706    );
707
708/* Decode a Unicode object unicode and return the result as Python
709   object.  The result is typed as a generic PyObject *; compare
   PyUnicode_AsDecodedUnicode(), which is documented to return a Unicode
   object. */
710
711PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
712    PyObject *unicode,          /* Unicode object */
713    const char *encoding,       /* encoding */
714    const char *errors          /* error handling */
715    );
716
717/* Decode a Unicode object unicode and return the result as Unicode
718   object.  Same parameters as PyUnicode_AsDecodedObject(). */
719
720PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
721    PyObject *unicode,          /* Unicode object */
722    const char *encoding,       /* encoding */
723    const char *errors          /* error handling */
724    );
725
726/* Encodes a Py_UNICODE buffer of the given size and returns a
727   Python string object.  size counts Py_UNICODE code units, not bytes
   (see the parameter comment). */
728
729PyAPI_FUNC(PyObject*) PyUnicode_Encode(
730    const Py_UNICODE *s,        /* Unicode char buffer */
731    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
732    const char *encoding,       /* encoding */
733    const char *errors          /* error handling */
734    );
735
736/* Encodes a Unicode object and returns the result as Python
737   object.  The result is typed as a generic PyObject *; use
   PyUnicode_AsEncodedString() or PyUnicode_AsEncodedUnicode() when a
   specific result type is expected. */
738
739PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
740    PyObject *unicode,          /* Unicode object */
741    const char *encoding,       /* encoding */
742    const char *errors          /* error handling */
743    );
744
745/* Encodes a Unicode object and returns the result as Python string
746   object.  Same parameters as PyUnicode_AsEncodedObject(). */
747
748PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
749    PyObject *unicode,          /* Unicode object */
750    const char *encoding,       /* encoding */
751    const char *errors          /* error handling */
752    );
753
754/* Encodes a Unicode object and returns the result as Unicode
755   object. */
756
757PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
758    PyObject *unicode,          /* Unicode object */
759    const char *encoding,       /* encoding */
760    const char *errors          /* error handling */
761    );
762
763/* Build an encoding map. */
764
765PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
766    PyObject* string            /* 256 character map */
767   );
768
769/* --- UTF-7 Codecs ------------------------------------------------------- */
770
/* Decodes length bytes from the UTF-7 encoded buffer string; errors
   selects the error handling.
   NOTE(review): presumably returns the decoded Unicode object, like the
   documented PyUnicode_DecodeUTF16() -- confirm against the
   implementation. */
771PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
772    const char *string,         /* UTF-7 encoded string */
773    Py_ssize_t length,          /* size of string */
774    const char *errors          /* error handling */
775    );
776
/* Takes the same string/length/errors arguments as
   PyUnicode_DecodeUTF7(), plus consumed, through which the number of
   bytes consumed is reported.
   NOTE(review): the behaviour when consumed is NULL is not visible in
   this header -- confirm against the implementation. */
777PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
778    const char *string,         /* UTF-7 encoded string */
779    Py_ssize_t length,          /* size of string */
780    const char *errors,         /* error handling */
781    Py_ssize_t *consumed        /* bytes consumed */
782    );
783
/* Encodes length Py_UNICODE characters from data using UTF-7.

   base64SetO and base64WhiteSpace select whether RFC 2152 Set O
   characters and whitespace (sp, ht, nl, cr), respectively, are
   encoded in base64.  errors selects the error handling.

   NOTE(review): presumably returns a Python string object, like the
   documented PyUnicode_EncodeUTF16() -- confirm against the
   implementation. */
784PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
785    const Py_UNICODE *data,     /* Unicode char buffer */
786    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
787    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
788    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
789    const char *errors          /* error handling */
790    );
791
792/* --- UTF-8 Codecs ------------------------------------------------------- */
793
/* Decodes length bytes from the UTF-8 encoded buffer string; errors
   selects the error handling.
   NOTE(review): presumably returns the decoded Unicode object, like the
   documented PyUnicode_DecodeUTF16() -- confirm against the
   implementation. */
794PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
795    const char *string,         /* UTF-8 encoded string */
796    Py_ssize_t length,          /* size of string */
797    const char *errors          /* error handling */
798    );
799
/* Takes the same string/length/errors arguments as
   PyUnicode_DecodeUTF8(), plus consumed, through which the number of
   bytes consumed is reported.
   NOTE(review): the behaviour when consumed is NULL is not visible in
   this header -- confirm against the implementation. */
800PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
801    const char *string,         /* UTF-8 encoded string */
802    Py_ssize_t length,          /* size of string */
803    const char *errors,         /* error handling */
804    Py_ssize_t *consumed        /* bytes consumed */
805    );
806
/* NOTE(review): presumably returns a Python string holding the UTF-8
   encoded value of the Unicode object unicode, by analogy with the
   documented PyUnicode_AsUTF16String() -- confirm against the
   implementation. */
807PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
808    PyObject *unicode           /* Unicode object */
809    );
810
/* Encodes length Py_UNICODE characters from data using UTF-8; errors
   selects the error handling.
   NOTE(review): presumably returns a Python string object, like the
   documented PyUnicode_EncodeUTF16() -- confirm against the
   implementation. */
811PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
812    const Py_UNICODE *data,     /* Unicode char buffer */
813    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
814    const char *errors          /* error handling */
815    );
816
817/* --- UTF-32 Codecs ------------------------------------------------------ */
818
819/* Decodes length bytes from a UTF-32 encoded buffer string and returns
820   the corresponding Unicode object.
821
822   errors (if non-NULL) defines the error handling. It defaults
823   to "strict".
824
825   If byteorder is non-NULL, the decoder starts decoding using the
826   given byte order:
827
828    *byteorder == -1: little endian
829    *byteorder == 0:  native order
830    *byteorder == 1:  big endian
831
832   In native mode, the first four bytes of the stream are checked for a
833   BOM mark. If found, the BOM mark is analysed, the byte order
834   adjusted and the BOM skipped.  In the other modes, no BOM mark
835   interpretation is done. After completion, *byteorder is set to the
836   current byte order at the end of input data.
837
838   If byteorder is NULL, the codec starts in native order mode.
839
840*/
841
842PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
843    const char *string,         /* UTF-32 encoded string */
844    Py_ssize_t length,          /* size of string */
845    const char *errors,         /* error handling */
846    int *byteorder              /* pointer to byteorder to use
847                                   0=native;-1=LE,1=BE; updated on
848                                   exit */
849    );
850
851PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
852    const char *string,         /* UTF-32 encoded string */
853    Py_ssize_t length,          /* size of string */
854    const char *errors,         /* error handling */
855    int *byteorder,             /* pointer to byteorder to use
856                                   0=native;-1=LE,1=BE; updated on
857                                   exit */
858    Py_ssize_t *consumed        /* bytes consumed */
859    );
860
861/* Returns a Python string using the UTF-32 encoding in native byte
862   order. The string always starts with a BOM mark.  */
863
864PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
865    PyObject *unicode           /* Unicode object */
866    );
867
868/* Returns a Python string object holding the UTF-32 encoded value of
869   the Unicode data.
870
871   If byteorder is not 0, output is written according to the following
872   byte order:
873
874   byteorder == -1: little endian
875   byteorder == 0:  native byte order (writes a BOM mark)
876   byteorder == 1:  big endian
877
878   If byteorder is 0, the output string will always start with the
879   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
880   prepended.
881
882*/
883
884PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
885    const Py_UNICODE *data,     /* Unicode char buffer */
886    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
887    const char *errors,         /* error handling */
888    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
889    );
890
891/* --- UTF-16 Codecs ------------------------------------------------------ */
892
893/* Decodes length bytes from a UTF-16 encoded buffer string and returns
894   the corresponding Unicode object.
895
896   errors (if non-NULL) defines the error handling. It defaults
897   to "strict".
898
899   If byteorder is non-NULL, the decoder starts decoding using the
900   given byte order:
901
902    *byteorder == -1: little endian
903    *byteorder == 0:  native order
904    *byteorder == 1:  big endian
905
906   In native mode, the first two bytes of the stream are checked for a
907   BOM mark. If found, the BOM mark is analysed, the byte order
908   adjusted and the BOM skipped.  In the other modes, no BOM mark
909   interpretation is done. After completion, *byteorder is set to the
910   current byte order at the end of input data.
911
912   If byteorder is NULL, the codec starts in native order mode.
913
914*/
915
916PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
917    const char *string,         /* UTF-16 encoded string */
918    Py_ssize_t length,          /* size of string */
919    const char *errors,         /* error handling */
920    int *byteorder              /* pointer to byteorder to use
921                                   0=native;-1=LE,1=BE; updated on
922                                   exit */
923    );
924
925PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
926    const char *string,         /* UTF-16 encoded string */
927    Py_ssize_t length,          /* size of string */
928    const char *errors,         /* error handling */
929    int *byteorder,             /* pointer to byteorder to use
930                                   0=native;-1=LE,1=BE; updated on
931                                   exit */
932    Py_ssize_t *consumed        /* bytes consumed */
933    );
934
935/* Returns a Python string using the UTF-16 encoding in native byte
936   order. The string always starts with a BOM mark.  */
937
938PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
939    PyObject *unicode           /* Unicode object */
940    );
941
942/* Returns a Python string object holding the UTF-16 encoded value of
943   the Unicode data.
944
945   If byteorder is not 0, output is written according to the following
946   byte order:
947
948   byteorder == -1: little endian
949   byteorder == 0:  native byte order (writes a BOM mark)
950   byteorder == 1:  big endian
951
952   If byteorder is 0, the output string will always start with the
953   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
954   prepended.
955
956   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
957   UCS-2. This trick makes it possible to add full UTF-16 capabilities
958   at a later point without compromising the APIs.
959
960*/
961
962PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
963    const Py_UNICODE *data,     /* Unicode char buffer */
964    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
965    const char *errors,         /* error handling */
966    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
967    );
968
969/* --- Unicode-Escape Codecs ---------------------------------------------- */
970
971PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
972    const char *string,         /* Unicode-Escape encoded string */
973    Py_ssize_t length,          /* size of string */
974    const char *errors          /* error handling */
975    );
976
977PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
978    PyObject *unicode           /* Unicode object */
979    );
980
981PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
982    const Py_UNICODE *data,     /* Unicode char buffer */
983    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
984    );
985
986/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
987
988PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
989    const char *string,         /* Raw-Unicode-Escape encoded string */
990    Py_ssize_t length,          /* size of string */
991    const char *errors          /* error handling */
992    );
993
994PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
995    PyObject *unicode           /* Unicode object */
996    );
997
998PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
999    const Py_UNICODE *data,     /* Unicode char buffer */
1000    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1001    );
1002
1003/* --- Unicode Internal Codec ---------------------------------------------
1004
1005    Only for internal use in _codecsmodule.c */
1006
1007PyObject *_PyUnicode_DecodeUnicodeInternal(
1008    const char *string,
1009    Py_ssize_t length,
1010    const char *errors
1011    );
1012
1013/* --- Latin-1 Codecs -----------------------------------------------------
1014
1015   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1016
1017*/
1018
1019PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1020    const char *string,         /* Latin-1 encoded string */
1021    Py_ssize_t length,          /* size of string */
1022    const char *errors          /* error handling */
1023    );
1024
1025PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1026    PyObject *unicode           /* Unicode object */
1027    );
1028
1029PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1030    const Py_UNICODE *data,     /* Unicode char buffer */
1031    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1032    const char *errors          /* error handling */
1033    );
1034
1035/* --- ASCII Codecs -------------------------------------------------------
1036
1037   Only 7-bit ASCII data is accepted. All other codes generate errors.
1038
1039*/
1040
1041PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1042    const char *string,         /* ASCII encoded string */
1043    Py_ssize_t length,          /* size of string */
1044    const char *errors          /* error handling */
1045    );
1046
1047PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1048    PyObject *unicode           /* Unicode object */
1049    );
1050
1051PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1052    const Py_UNICODE *data,     /* Unicode char buffer */
1053    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1054    const char *errors          /* error handling */
1055    );
1056
1057/* --- Character Map Codecs -----------------------------------------------
1058
1059   This codec uses mappings to encode and decode characters.
1060
1061   Decoding mappings must map single string characters to single
1062   Unicode characters, integers (which are then interpreted as Unicode
1063   ordinals) or None (meaning "undefined mapping" and causing an
1064   error).
1065
1066   Encoding mappings must map single Unicode characters to single
1067   string characters, integers (which are then interpreted as Latin-1
1068   ordinals) or None (meaning "undefined mapping" and causing an
1069   error).
1070
1071   If a character lookup fails with a LookupError, the character is
1072   copied as-is meaning that its ordinal value will be interpreted as
1073   Unicode or Latin-1 ordinal resp. Because of this mappings only need
1074   to contain those mappings which map characters to different code
1075   points.
1076
1077*/
1078
1079PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1080    const char *string,         /* Encoded string */
1081    Py_ssize_t length,          /* size of string */
1082    PyObject *mapping,          /* character mapping
1083                                   (char ordinal -> unicode ordinal) */
1084    const char *errors          /* error handling */
1085    );
1086
1087PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1088    PyObject *unicode,          /* Unicode object */
1089    PyObject *mapping           /* character mapping
1090                                   (unicode ordinal -> char ordinal) */
1091    );
1092
1093PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1094    const Py_UNICODE *data,     /* Unicode char buffer */
1095    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1096    PyObject *mapping,          /* character mapping
1097                                   (unicode ordinal -> char ordinal) */
1098    const char *errors          /* error handling */
1099    );
1100
1101/* Translate a Py_UNICODE buffer of the given length by applying a
1102   character mapping table to it and return the resulting Unicode
1103   object.
1104
1105   The mapping table must map Unicode ordinal integers to Unicode
1106   ordinal integers or None (causing deletion of the character).
1107
1108   Mapping tables may be dictionaries or sequences. Unmapped character
1109   ordinals (ones which cause a LookupError) are left untouched and
1110   are copied as-is.
1111
1112*/
1113
1114PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1115    const Py_UNICODE *data,     /* Unicode char buffer */
1116    Py_ssize_t length,          /* Number of Py_UNICODE chars to translate */
1117    PyObject *table,            /* Translate table */
1118    const char *errors          /* error handling */
1119    );
1120
1121#ifdef MS_WIN32
1122
1123/* --- MBCS codecs for Windows -------------------------------------------- */
1124
1125PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1126    const char *string,         /* MBCS encoded string */
1127    Py_ssize_t length,              /* size of string */
1128    const char *errors          /* error handling */
1129    );
1130
1131PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1132    const char *string,         /* MBCS encoded string */
1133    Py_ssize_t length,          /* size of string */
1134    const char *errors,         /* error handling */
1135    Py_ssize_t *consumed        /* bytes consumed */
1136    );
1137
1138PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1139    PyObject *unicode           /* Unicode object */
1140    );
1141
1142PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1143    const Py_UNICODE *data,     /* Unicode char buffer */
1144    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1145    const char *errors          /* error handling */
1146    );
1147
1148#endif /* MS_WIN32 */
1149
1150/* --- Decimal Encoder ---------------------------------------------------- */
1151
1152/* Takes a Unicode string holding a decimal value and writes it into
1153   an output buffer using standard ASCII digit codes.
1154
1155   The output buffer has to provide at least length+1 bytes of storage
1156   area. The output string is 0-terminated.
1157
1158   The encoder converts whitespace to ' ', decimal characters to their
1159   corresponding ASCII digit and all other Latin-1 characters except
1160   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1161   are treated as errors. This includes embedded NUL characters.
1162
1163   Error handling is defined by the errors argument:
1164
1165      NULL or "strict": raise a ValueError
1166      "ignore": ignore the wrong characters (these are not copied to the
1167                output buffer)
1168      "replace": replaces illegal characters with '?'
1169
1170   Returns 0 on success, -1 on failure.
1171
1172*/
1173
1174PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1175    Py_UNICODE *s,              /* Unicode buffer */
1176    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1177    char *output,               /* Output buffer; must have size >= length+1 */
1178    const char *errors          /* error handling */
1179    );
1180
1181/* --- File system encoding ---------------------------------------------- */
1182
1183/* ParseTuple converter: encode str objects to bytes using
1184   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1185
1186PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1187
1188/* ParseTuple converter: decode bytes objects to unicode using
1189   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1190
1191PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1192
1193/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1194   and the "surrogateescape" error handler.
1195
1196   If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
1197
1198   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1199*/
1200
1201PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1202    const char *s               /* encoded string */
1203    );
1204
1205/* Decode a string using Py_FileSystemDefaultEncoding
1206   and the "surrogateescape" error handler.
1207
1208   If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
1209*/
1210
1211PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1212    const char *s,               /* encoded string */
1213    Py_ssize_t size              /* size */
1214    );
1215
1216/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1217   "surrogateescape" error handler, and return bytes.
1218
1219   If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
1220*/
1221
1222PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1223    PyObject *unicode
1224    );
1225
1226/* --- Methods & Slots ----------------------------------------------------
1227
1228   These are capable of handling Unicode objects and strings on input
1229   (we refer to them as strings in the descriptions) and return
1230   Unicode objects or integers as appropriate. */
1231
1232/* Concat two strings giving a new Unicode string. */
1233
1234PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1235    PyObject *left,             /* Left string */
1236    PyObject *right             /* Right string */
1237    );
1238
1239/* Concat two strings and put the result in *pleft
1240   (sets *pleft to NULL on error) */
1241
1242PyAPI_FUNC(void) PyUnicode_Append(
1243    PyObject **pleft,           /* Pointer to left string */
1244    PyObject *right             /* Right string */
1245    );
1246
1247/* Concat two strings, put the result in *pleft and drop the right object
1248   (sets *pleft to NULL on error) */
1249
1250PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1251    PyObject **pleft,           /* Pointer to left string */
1252    PyObject *right             /* Right string */
1253    );
1254
1255/* Split a string giving a list of Unicode strings.
1256
1257   If sep is NULL, splitting will be done at all whitespace
1258   substrings. Otherwise, splits occur at the given separator.
1259
1260   At most maxsplit splits will be done. If negative, no limit is set.
1261
1262   Separators are not included in the resulting list.
1263
1264*/
1265
1266PyAPI_FUNC(PyObject*) PyUnicode_Split(
1267    PyObject *s,                /* String to split */
1268    PyObject *sep,              /* String separator */
1269    Py_ssize_t maxsplit         /* Maxsplit count */
1270    );
1271
1272/* Split a string at line breaks, giving a list of Unicode strings.
1273
1274   CRLF is considered to be one line break. Line breaks are not
1275   included in the resulting list unless keepends is true. */
1276
1277PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1278    PyObject *s,                /* String to split */
1279    int keepends                /* If true, line end markers are included */
1280    );
1281
1282/* Partition a string using a given separator. */
1283
1284PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1285    PyObject *s,                /* String to partition */
1286    PyObject *sep               /* String separator */
1287    );
1288
1289/* Partition a string using a given separator, searching from the end of the
1290   string. */
1291
1292PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1293    PyObject *s,                /* String to partition */
1294    PyObject *sep               /* String separator */
1295    );
1296
1297/* Split a string giving a list of Unicode strings.
1298
1299   If sep is NULL, splitting will be done at all whitespace
1300   substrings. Otherwise, splits occur at the given separator.
1301
1302   At most maxsplit splits will be done; if maxsplit is negative, no
1303   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1304   the end of the string.
1305
1306   Separators are not included in the resulting list.
1307
1308*/
1309
1310PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1311    PyObject *s,                /* String to split */
1312    PyObject *sep,              /* String separator */
1313    Py_ssize_t maxsplit         /* Maxsplit count */
1314    );
1315
1316/* Translate a string by applying a character mapping table to it and
1317   return the resulting Unicode object.
1318
1319   The mapping table must map Unicode ordinal integers to Unicode
1320   ordinal integers or None (causing deletion of the character).
1321
1322   Mapping tables may be dictionaries or sequences. Unmapped character
1323   ordinals (ones which cause a LookupError) are left untouched and
1324   are copied as-is.
1325
1326*/
1327
1328PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1329    PyObject *str,              /* String */
1330    PyObject *table,            /* Translate table */
1331    const char *errors          /* error handling */
1332    );
1333
1334/* Join a sequence of strings using the given separator and return
1335   the resulting Unicode string. */
1336
1337PyAPI_FUNC(PyObject*) PyUnicode_Join(
1338    PyObject *separator,        /* Separator string */
1339    PyObject *seq               /* Sequence object */
1340    );
1341
1342/* Return 1 if substr matches str[start:end] at the given tail end, 0
1343   otherwise. */
1344
1345PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1346    PyObject *str,              /* String */
1347    PyObject *substr,           /* Prefix or Suffix string */
1348    Py_ssize_t start,           /* Start index */
1349    Py_ssize_t end,             /* Stop index */
1350    int direction               /* Tail end: -1 prefix, +1 suffix */
1351    );
1352
1353/* Return the first position of substr in str[start:end] using the
1354   given search direction or -1 if not found. -2 is returned in case
1355   an error occurred and an exception is set. */
1356
1357PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1358    PyObject *str,              /* String */
1359    PyObject *substr,           /* Substring to find */
1360    Py_ssize_t start,           /* Start index */
1361    Py_ssize_t end,             /* Stop index */
1362    int direction               /* Find direction: +1 forward, -1 backward */
1363    );
1364
1365/* Count the number of occurrences of substr in str[start:end]. */
1366
1367PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1368    PyObject *str,              /* String */
1369    PyObject *substr,           /* Substring to count */
1370    Py_ssize_t start,           /* Start index */
1371    Py_ssize_t end              /* Stop index */
1372    );
1373
1374/* Replace at most maxcount occurrences of substr in str with replstr
1375   and return the resulting Unicode object. */
1376
1377PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1378    PyObject *str,              /* String */
1379    PyObject *substr,           /* Substring to find */
1380    PyObject *replstr,          /* Substring to replace */
1381    Py_ssize_t maxcount         /* Max. number of replacements to apply;
1382                                   -1 = all */
1383    );
1384
1385/* Compare two strings and return -1, 0, 1 for less than, equal,
1386   greater than resp. */
1387
1388PyAPI_FUNC(int) PyUnicode_Compare(
1389    PyObject *left,             /* Left string */
1390    PyObject *right             /* Right string */
1391    );
1392
1393PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1394    PyObject *left,
1395    const char *right
1396    );
1397
1398/* Rich compare two strings and return one of the following:
1399
1400   - NULL in case an exception was raised
1401   - Py_True or Py_False for successful comparisons
1402   - Py_NotImplemented in case the type combination is unknown
1403
1404   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1405   case the conversion of the arguments to Unicode fails with a
1406   UnicodeDecodeError.
1407
1408   Possible values for op:
1409
1410     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1411
1412*/
1413
1414PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1415    PyObject *left,             /* Left string */
1416    PyObject *right,            /* Right string */
1417    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1418    );
1419
1420/* Apply an argument tuple or dictionary to a format string and return
1421   the resulting Unicode string. */
1422
1423PyAPI_FUNC(PyObject *) PyUnicode_Format(
1424    PyObject *format,           /* Format string */
1425    PyObject *args              /* Argument tuple or dictionary */
1426    );
1427
1428/* Checks whether element is contained in container and returns 1/0
1429   accordingly.
1430
1431   element has to coerce to a one-element Unicode string. -1 is
1432   returned in case of an error. */
1433
1434PyAPI_FUNC(int) PyUnicode_Contains(
1435    PyObject *container,        /* Container string */
1436    PyObject *element           /* Element string */
1437    );
1438
1439/* Checks whether argument is a valid identifier. */
1440
1441PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1442
1443/* Externally visible for str.strip(unicode) */
1444PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1445    PyUnicodeObject *self,
1446    int striptype,
1447    PyObject *sepobj
1448    );
1449
1450/* Using the current locale, insert the thousands grouping
1451   into the string pointed to by buffer.  For the argument descriptions,
1452   see Objects/stringlib/localeutil.h */
1453
1454PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1455                                                   Py_ssize_t n_buffer,
1456                                                   Py_UNICODE *digits,
1457                                                   Py_ssize_t n_digits,
1458                                                   Py_ssize_t min_width);
1459
1460/* Using explicit passed-in values, insert the thousands grouping
1461   into the string pointed to by buffer.  For the argument descriptions,
1462   see Objects/stringlib/localeutil.h */
1463PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer,
1464                                                   Py_ssize_t n_buffer,
1465                                                   Py_UNICODE *digits,
1466                                                   Py_ssize_t n_digits,
1467                                                   Py_ssize_t min_width,
1468                                                   const char *grouping,
1469                                                   const char *thousands_sep);
1470/* === Characters Type APIs =============================================== */
1471
1472/* Helper array used by Py_UNICODE_ISSPACE(). */
1473
1474PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1475
1476/* These should not be used directly. Use the Py_UNICODE_IS* and
1477   Py_UNICODE_TO* macros instead.
1478
1479   These APIs are implemented in Objects/unicodectype.c.
1480
1481*/
1482
1483PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1484    Py_UCS4 ch       /* Unicode character */
1485    );
1486
1487PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1488    Py_UCS4 ch       /* Unicode character */
1489    );
1490
1491PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1492    Py_UCS4 ch       /* Unicode character */
1493    );
1494
1495PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1496    Py_UCS4 ch       /* Unicode character */
1497    );
1498
1499PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1500    Py_UCS4 ch       /* Unicode character */
1501    );
1502
1503PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1504    const Py_UCS4 ch         /* Unicode character */
1505    );
1506
1507PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1508    const Py_UCS4 ch         /* Unicode character */
1509    );
1510
1511PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1512    Py_UCS4 ch       /* Unicode character */
1513    );
1514
1515PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1516    Py_UCS4 ch       /* Unicode character */
1517    );
1518
1519PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1520    Py_UCS4 ch       /* Unicode character */
1521    );
1522
1523PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1524    Py_UCS4 ch       /* Unicode character */
1525    );
1526
1527PyAPI_FUNC(int) _PyUnicode_ToDigit(
1528    Py_UCS4 ch       /* Unicode character */
1529    );
1530
1531PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1532    Py_UCS4 ch       /* Unicode character */
1533    );
1534
1535PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1536    Py_UCS4 ch       /* Unicode character */
1537    );
1538
1539PyAPI_FUNC(int) _PyUnicode_IsDigit(
1540    Py_UCS4 ch       /* Unicode character */
1541    );
1542
1543PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1544    Py_UCS4 ch       /* Unicode character */
1545    );
1546
1547PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1548    Py_UCS4 ch       /* Unicode character */
1549    );
1550
1551PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1552    Py_UCS4 ch       /* Unicode character */
1553    );
1554
/* Py_UNICODE string helpers.

   NOTE(review): the names and signatures below mirror the C library
   functions strlen, strcpy, strcat, strncpy, strcmp, strncmp, strchr
   and strrchr, with Py_UNICODE in place of char.  Presumably they
   behave like those functions on 0-terminated Py_UNICODE buffers
   (including the same requirement that destination buffers be large
   enough) -- confirm against the implementation before relying on it. */
1555PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1556    const Py_UNICODE *u
1557    );
1558
1559PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1560    Py_UNICODE *s1,
1561    const Py_UNICODE *s2);
1562
1563PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1564    Py_UNICODE *s1, const Py_UNICODE *s2);
1565
1566PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1567    Py_UNICODE *s1,
1568    const Py_UNICODE *s2,
1569    size_t n);
1570
1571PyAPI_FUNC(int) Py_UNICODE_strcmp(
1572    const Py_UNICODE *s1,
1573    const Py_UNICODE *s2
1574    );
1575
1576PyAPI_FUNC(int) Py_UNICODE_strncmp(
1577    const Py_UNICODE *s1,
1578    const Py_UNICODE *s2,
1579    size_t n
1580    );
1581
1582PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1583    const Py_UNICODE *s,
1584    Py_UNICODE c
1585    );
1586
1587PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
1588    const Py_UNICODE *s,
1589    Py_UNICODE c
1590    );
1591
1592/* Create a copy of a unicode string ending with a nul character. Return NULL
1593   and raise a MemoryError exception on memory allocation failure, otherwise
1594   return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1595
1596PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
1597    PyObject *unicode
1598    );
1599
1600#ifdef __cplusplus
1601}
1602#endif
1603#endif /* !Py_UNICODEOBJECT_H */
1604