unicodeobject.h revision feb7307db4b4582af9ac01719f7df651c2eed077
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal (see file Misc/unicode.txt).
11
12Copyright (c) Corporation for National Research Initiatives.
13
14
15 Original header:
16 --------------------------------------------------------------------
17
18 * Yet another Unicode string type for Python.  This type supports the
19 * 16-bit Basic Multilingual Plane (BMP) only.
20 *
21 * Written by Fredrik Lundh, January 1999.
22 *
23 * Copyright (c) 1999 by Secret Labs AB.
24 * Copyright (c) 1999 by Fredrik Lundh.
25 *
26 * fredrik@pythonware.com
27 * http://www.pythonware.com
28 *
29 * --------------------------------------------------------------------
30 * This Unicode String Type is
31 *
32 * Copyright (c) 1999 by Secret Labs AB
33 * Copyright (c) 1999 by Fredrik Lundh
34 *
35 * By obtaining, using, and/or copying this software and/or its
36 * associated documentation, you agree that you have read, understood,
37 * and will comply with the following terms and conditions:
38 *
39 * Permission to use, copy, modify, and distribute this software and its
40 * associated documentation for any purpose and without fee is hereby
41 * granted, provided that the above copyright notice appears in all
42 * copies, and that both that copyright notice and this permission notice
43 * appear in supporting documentation, and that the name of Secret Labs
44 * AB or the author not be used in advertising or publicity pertaining to
45 * distribution of the software without specific, written prior
46 * permission.
47 *
48 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
50 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
55 * -------------------------------------------------------------------- */
56
57#include <ctype.h>
58
59/* === Internal API ======================================================= */
60
61/* --- Internal Unicode Format -------------------------------------------- */
62
63/* Python 3.x requires unicode */
64#define Py_USING_UNICODE
65
66/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
67   properly set, but the default rules below doesn't set it.  I'll
68   sort this out some other day -- fredrik@pythonware.com */
69
70#ifndef Py_UNICODE_SIZE
71#error Must define Py_UNICODE_SIZE
72#endif
73
74/* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
75   strings are stored as UCS-2 (with limited support for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
86/* Defaults for various platforms */
87#ifndef PY_UNICODE_TYPE
88
89/* Windows has a usable wchar_t type (unless we're using UCS-4) */
90# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
91#  define HAVE_USABLE_WCHAR_T
92#  define PY_UNICODE_TYPE wchar_t
93# endif
94
95# if defined(Py_UNICODE_WIDE)
96#  define PY_UNICODE_TYPE Py_UCS4
97# endif
98
99#endif
100
101/* If the compiler provides a wchar_t type we try to support it
102   through the interface functions PyUnicode_FromWideChar() and
103   PyUnicode_AsWideChar(). */
104
105#ifdef HAVE_USABLE_WCHAR_T
106# ifndef HAVE_WCHAR_H
107#  define HAVE_WCHAR_H
108# endif
109#endif
110
111#ifdef HAVE_WCHAR_H
112/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
113# ifdef _HAVE_BSDI
114#  include <time.h>
115# endif
116#  include <wchar.h>
117#endif
118
119/*
120 * Use this typedef when you need to represent a UTF-16 surrogate pair
121 * as single unsigned integer.
122 */
123#if SIZEOF_INT >= 4
124typedef unsigned int Py_UCS4;
125#elif SIZEOF_LONG >= 4
126typedef unsigned long Py_UCS4;
127#endif
128
129/* Py_UNICODE is the native Unicode storage format (code unit) used by
130   Python and represents a single Unicode element in the Unicode
131   type. */
132
133typedef PY_UNICODE_TYPE Py_UNICODE;
134
135/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
136
137/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds
138   produce different external names and thus cause import errors in
139   case Python interpreters and extensions with mixed compiled in
140   Unicode width assumptions are combined. */
141
142#ifndef Py_UNICODE_WIDE
143
144# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
145# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
146# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject
147# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode
148# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
149# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
150# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode
151# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
152# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
153# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
154# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
155# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
156# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
157# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
158# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
159# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist
160# define PyUnicode_Compare PyUnicodeUCS2_Compare
161# define PyUnicode_CompareWithASCII PyUnicodeUCS2_CompareASCII
162# define PyUnicode_Concat PyUnicodeUCS2_Concat
163# define PyUnicode_Append PyUnicodeUCS2_Append
164# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
165# define PyUnicode_Contains PyUnicodeUCS2_Contains
166# define PyUnicode_Count PyUnicodeUCS2_Count
167# define PyUnicode_Decode PyUnicodeUCS2_Decode
168# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
169# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
170# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
171# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
172# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize
173# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
174# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
175# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
176# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
177# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
178# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
179# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
180# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
181# define PyUnicode_Encode PyUnicodeUCS2_Encode
182# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
183# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
184# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
185# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
186# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
187# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
188# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
189# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
190# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
191# define PyUnicode_Find PyUnicodeUCS2_Find
192# define PyUnicode_Format PyUnicodeUCS2_Format
193# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
194# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
195# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
196# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
197# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
198# define PyUnicode_FromString PyUnicodeUCS2_FromString
199# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
200# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
201# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
202# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter
203# define PyUnicode_FSDecoder PyUnicodeUCS2_FSDecoder
204# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
205# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
206# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
207# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
208# define PyUnicode_Join PyUnicodeUCS2_Join
209# define PyUnicode_Partition PyUnicodeUCS2_Partition
210# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
211# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
212# define PyUnicode_Replace PyUnicodeUCS2_Replace
213# define PyUnicode_Resize PyUnicodeUCS2_Resize
214# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
215# define PyUnicode_Split PyUnicodeUCS2_Split
216# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
217# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
218# define PyUnicode_Translate PyUnicodeUCS2_Translate
219# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
220# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
221# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
222# define _PyUnicode_Init _PyUnicodeUCS2_Init
223# define PyUnicode_strdup PyUnicodeUCS2_strdup
224
225#else
226
227# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
228# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
229# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject
230# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode
231# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
232# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
233# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode
234# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
235# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
236# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
237# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
238# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
239# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
240# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
241# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
242# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist
243# define PyUnicode_Compare PyUnicodeUCS4_Compare
244# define PyUnicode_CompareWithASCII PyUnicodeUCS4_CompareWithASCII
245# define PyUnicode_Concat PyUnicodeUCS4_Concat
246# define PyUnicode_Append PyUnicodeUCS4_Append
247# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
248# define PyUnicode_Contains PyUnicodeUCS4_Contains
249# define PyUnicode_Count PyUnicodeUCS4_Count
250# define PyUnicode_Decode PyUnicodeUCS4_Decode
251# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
252# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
253# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
254# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
255# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize
256# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
257# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
258# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
259# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
260# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
261# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
262# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
263# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
264# define PyUnicode_Encode PyUnicodeUCS4_Encode
265# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
266# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
267# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
268# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
269# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
270# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
271# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
272# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
273# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
274# define PyUnicode_Find PyUnicodeUCS4_Find
275# define PyUnicode_Format PyUnicodeUCS4_Format
276# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
277# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
278# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
279# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
280# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
281# define PyUnicode_FromString PyUnicodeUCS4_FromString
282# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
283# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
284# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
285# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter
286# define PyUnicode_FSDecoder PyUnicodeUCS4_FSDecoder
287# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
288# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
289# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
290# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
291# define PyUnicode_Join PyUnicodeUCS4_Join
292# define PyUnicode_Partition PyUnicodeUCS4_Partition
293# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
294# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
295# define PyUnicode_Replace PyUnicodeUCS4_Replace
296# define PyUnicode_Resize PyUnicodeUCS4_Resize
297# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
298# define PyUnicode_Split PyUnicodeUCS4_Split
299# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
300# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
301# define PyUnicode_Translate PyUnicodeUCS4_Translate
302# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
303# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
304# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
305# define _PyUnicode_Init _PyUnicodeUCS4_Init
306# define PyUnicode_strdup PyUnicodeUCS4_strdup
307
308#endif
309
310/* --- Internal Unicode Operations ---------------------------------------- */
311
312/* Since splitting on whitespace is an important use case, and
313   whitespace in most situations is solely ASCII whitespace, we
314   optimize for the common case by using a quick look-up table
315   _Py_ascii_whitespace (see below) with an inlined check.
316
317 */
318#define Py_UNICODE_ISSPACE(ch) \
319    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
320
321#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
322#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
323#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
324#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
325
326#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
327#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
328#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
329
330#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
331#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
332#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
333#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
334
335#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
336#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
337#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
338
339#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
340
341#define Py_UNICODE_ISALNUM(ch) \
342       (Py_UNICODE_ISALPHA(ch) || \
343    Py_UNICODE_ISDECIMAL(ch) || \
344    Py_UNICODE_ISDIGIT(ch) || \
345    Py_UNICODE_ISNUMERIC(ch))
346
347#define Py_UNICODE_COPY(target, source, length)                         \
348    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
349
350#define Py_UNICODE_FILL(target, value, length) \
351    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
352    for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\
353    } while (0)
354
355/* Check if substring matches at given offset.  the offset must be
356   valid, and the substring must not be empty */
357
358#define Py_UNICODE_MATCH(string, offset, substring) \
359    ((*((string)->str + (offset)) == *((substring)->str)) && \
360    ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
361     !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
362
363#ifdef __cplusplus
364extern "C" {
365#endif
366
367/* --- Unicode Type ------------------------------------------------------- */
368
369typedef struct {
370    PyObject_HEAD
371    Py_ssize_t length;          /* Length of raw Unicode data in buffer */
372    Py_UNICODE *str;            /* Raw Unicode buffer */
373    long hash;                  /* Hash value; -1 if not set */
374    int state;                  /* != 0 if interned. In this case the two
375                                 * references from the dictionary to this object
376                                 * are *not* counted in ob_refcnt. */
377    PyObject *defenc;           /* (Default) Encoded version as Python
378                                   string, or NULL; this is used for
379                                   implementing the buffer protocol */
380} PyUnicodeObject;
381
382PyAPI_DATA(PyTypeObject) PyUnicode_Type;
383PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
384
385#define SSTATE_NOT_INTERNED 0
386#define SSTATE_INTERNED_MORTAL 1
387#define SSTATE_INTERNED_IMMORTAL 2
388
389#define PyUnicode_Check(op) \
390                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
391#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
392
393/* Fast access macros */
394#define PyUnicode_GET_SIZE(op) \
395    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length))
396#define PyUnicode_GET_DATA_SIZE(op) \
397    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))
398#define PyUnicode_AS_UNICODE(op) \
399    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str))
400#define PyUnicode_AS_DATA(op) \
401    (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str))
402
403/* --- Constants ---------------------------------------------------------- */
404
405/* This Unicode character will be used as replacement character during
406   decoding if the errors argument is set to "replace". Note: the
407   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
408   Unicode 3.0. */
409
410#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
411
412/* === Public API ========================================================= */
413
414/* --- Plain Py_UNICODE --------------------------------------------------- */
415
416/* Create a Unicode Object from the Py_UNICODE buffer u of the given
417   size.
418
419   u may be NULL which causes the contents to be undefined. It is the
420   user's responsibility to fill in the needed data afterwards. Note
421   that modifying the Unicode object contents after construction is
422   only allowed if u was set to NULL.
423
424   The buffer is copied into the new object. */
425
426PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
427    const Py_UNICODE *u,        /* Unicode buffer */
428    Py_ssize_t size             /* size of buffer */
429    );
430
431/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
432PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
433    const char *u,        /* char buffer */
434    Py_ssize_t size       /* size of buffer */
435    );
436
437/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
438   UTF-8 encoded bytes */
439PyAPI_FUNC(PyObject*) PyUnicode_FromString(
440    const char *u        /* string */
441    );
442
443/* Return a read-only pointer to the Unicode object's internal
444   Py_UNICODE buffer. */
445
446PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
447    PyObject *unicode           /* Unicode object */
448    );
449
450/* Get the length of the Unicode object. */
451
452PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
453    PyObject *unicode           /* Unicode object */
454    );
455
456/* Get the maximum ordinal for a Unicode character. */
457PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
458
459/* Resize an already allocated Unicode object to the new size length.
460
461   *unicode is modified to point to the new (resized) object and 0
462   returned on success.
463
464   This API may only be called by the function which also called the
465   Unicode constructor. The refcount on the object must be 1. Otherwise,
466   an error is returned.
467
468   Error handling is implemented as follows: an exception is set, -1
469   is returned and *unicode left untouched.
470
471*/
472
473PyAPI_FUNC(int) PyUnicode_Resize(
474    PyObject **unicode,         /* Pointer to the Unicode object */
475    Py_ssize_t length           /* New length */
476    );
477
478/* Coerce obj to an Unicode object and return a reference with
479   *incremented* refcount.
480
481   Coercion is done in the following way:
482
483   1. bytes, bytearray and other char buffer compatible objects are decoded
484      under the assumptions that they contain data using the current
485      default encoding. Decoding is done in "strict" mode.
486
487   2. All other objects (including Unicode objects) raise an
488      exception.
489
490   The API returns NULL in case of an error. The caller is responsible
491   for decref'ing the returned objects.
492
493*/
494
495PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
496    register PyObject *obj,     /* Object */
497    const char *encoding,       /* encoding */
498    const char *errors          /* error handling */
499    );
500
501/* Coerce obj to an Unicode object and return a reference with
502   *incremented* refcount.
503
504   Unicode objects are passed back as-is (subclasses are converted to
505   true Unicode objects), all other objects are delegated to
506   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
507   using UTF-8 encoding as basis for decoding the object.
508
509   The API returns NULL in case of an error. The caller is responsible
510   for decref'ing the returned objects.
511
512*/
513
514PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
515    register PyObject *obj      /* Object */
516    );
517
518PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
519    const char *format,   /* ASCII-encoded string  */
520    va_list vargs
521    );
522PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
523    const char *format,   /* ASCII-encoded string  */
524    ...
525    );
526
527/* Format the object based on the format_spec, as defined in PEP 3101
528   (Advanced String Formatting). */
529PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
530                                                 Py_UNICODE *format_spec,
531                                                 Py_ssize_t format_spec_len);
532
533PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
534PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
535PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
536PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
537
538/* Use only if you know it's a string */
539#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
540
541/* --- wchar_t support for platforms which support it --------------------- */
542
543#ifdef HAVE_WCHAR_H
544
545/* Create a Unicode Object from the wchar_t buffer w of the given
546   size.
547
548   The buffer is copied into the new object. */
549
550PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
551    register const wchar_t *w,  /* wchar_t buffer */
552    Py_ssize_t size             /* size of buffer */
553    );
554
555/* Copies the Unicode Object contents into the wchar_t buffer w.  At
556   most size wchar_t characters are copied.
557
558   Note that the resulting wchar_t string may or may not be
559   0-terminated.  It is the responsibility of the caller to make sure
560   that the wchar_t string is 0-terminated in case this is required by
561   the application.
562
563   Returns the number of wchar_t characters copied (excluding a
564   possibly trailing 0-termination character) or -1 in case of an
565   error. */
566
567PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
568    PyUnicodeObject *unicode,   /* Unicode object */
569    register wchar_t *w,        /* wchar_t buffer */
570    Py_ssize_t size             /* size of buffer */
571    );
572
573#endif
574
575/* --- Unicode ordinals --------------------------------------------------- */
576
577/* Create a Unicode Object from the given Unicode code point ordinal.
578
579   The ordinal must be in range(0x10000) on narrow Python builds
580   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
581   raised in case it is not.
582
583*/
584
585PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
586
587/* --- Free-list management ----------------------------------------------- */
588
589/* Clear the free list used by the Unicode implementation.
590
591   This can be used to release memory used for objects on the free
592   list back to the Python memory allocator.
593
594*/
595
596PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
597
598/* === Builtin Codecs =====================================================
599
600   Many of these APIs take two arguments encoding and errors. These
601   parameters encoding and errors have the same semantics as the ones
602   of the builtin unicode() API.
603
604   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
605
606   Error handling is set by errors which may also be set to NULL
607   meaning to use the default handling defined for the codec. Default
608   error handling for all builtin codecs is "strict" (ValueErrors are
609   raised).
610
611   The codecs all use a similar interface. Only deviation from the
612   generic ones are documented.
613
614*/
615
616/* --- Manage the default encoding ---------------------------------------- */
617
618/* Return a Python string holding the default encoded value of the
619   Unicode object.
620
621   The resulting string is cached in the Unicode object for subsequent
622   usage by this function. The cached version is needed to implement
623   the character buffer interface and will live (at least) as long as
624   the Unicode object itself.
625
626   The refcount of the string is *not* incremented.
627
628   *** Exported for internal use by the interpreter only !!! ***
629
630*/
631
632PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
633    PyObject *unicode,
634    const char *errors);
635
636/* Returns a pointer to the default encoding (normally, UTF-8) of the
637   Unicode object unicode and the size of the encoded representation
638   in bytes stored in *size.
639
640   In case of an error, no *size is set.
641
642   *** This API is for interpreter INTERNAL USE ONLY and will likely
643   *** be removed or changed for Python 3.1.
644
645   *** If you need to access the Unicode object as UTF-8 bytes string,
646   *** please use PyUnicode_AsUTF8String() instead.
647
648*/
649
650PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize(
651    PyObject *unicode,
652    Py_ssize_t *size);
653
654/* Returns a pointer to the default encoding (normally, UTf-8) of the
655   Unicode object unicode.
656
657   Use of this API is DEPRECATED since no size information can be
658   extracted from the returned data.
659
660   *** This API is for interpreter INTERNAL USE ONLY and will likely
661   *** be removed or changed for Python 3.1.
662
663   *** If you need to access the Unicode object as UTF-8 bytes string,
664   *** please use PyUnicode_AsUTF8String() instead.
665
666*/
667
668PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode);
669
670/* Returns the currently active default encoding.
671
672   The default encoding is currently implemented as run-time settable
673   process global.  This may change in future versions of the
674   interpreter to become a parameter which is managed on a per-thread
675   basis.
676
677 */
678
679PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
680
681/* --- Generic Codecs ----------------------------------------------------- */
682
683/* Create a Unicode object by decoding the encoded string s of the
684   given size. */
685
686PyAPI_FUNC(PyObject*) PyUnicode_Decode(
687    const char *s,              /* encoded string */
688    Py_ssize_t size,            /* size of buffer */
689    const char *encoding,       /* encoding */
690    const char *errors          /* error handling */
691    );
692
693/* Decode a Unicode object unicode and return the result as Python
694   object. */
695
696PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
697    PyObject *unicode,          /* Unicode object */
698    const char *encoding,       /* encoding */
699    const char *errors          /* error handling */
700    );
701
702/* Decode a Unicode object unicode and return the result as Unicode
703   object. */
704
705PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
706    PyObject *unicode,          /* Unicode object */
707    const char *encoding,       /* encoding */
708    const char *errors          /* error handling */
709    );
710
711/* Encodes a Py_UNICODE buffer of the given size and returns a
712   Python string object. */
713
714PyAPI_FUNC(PyObject*) PyUnicode_Encode(
715    const Py_UNICODE *s,        /* Unicode char buffer */
716    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
717    const char *encoding,       /* encoding */
718    const char *errors          /* error handling */
719    );
720
721/* Encodes a Unicode object and returns the result as Python
722   object. */
723
724PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
725    PyObject *unicode,          /* Unicode object */
726    const char *encoding,       /* encoding */
727    const char *errors          /* error handling */
728    );
729
730/* Encodes a Unicode object and returns the result as Python string
731   object. */
732
733PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
734    PyObject *unicode,          /* Unicode object */
735    const char *encoding,       /* encoding */
736    const char *errors          /* error handling */
737    );
738
739/* Encodes a Unicode object and returns the result as Unicode
740   object. */
741
742PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
743    PyObject *unicode,          /* Unicode object */
744    const char *encoding,       /* encoding */
745    const char *errors          /* error handling */
746    );
747
748/* Build an encoding map. */
749
750PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
751    PyObject* string            /* 256 character map */
752   );
753
754/* --- UTF-7 Codecs ------------------------------------------------------- */
755
756PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
757    const char *string,         /* UTF-7 encoded string */
758    Py_ssize_t length,          /* size of string */
759    const char *errors          /* error handling */
760    );
761
762PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
763    const char *string,         /* UTF-7 encoded string */
764    Py_ssize_t length,          /* size of string */
765    const char *errors,         /* error handling */
766    Py_ssize_t *consumed        /* bytes consumed */
767    );
768
769PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
770    const Py_UNICODE *data,     /* Unicode char buffer */
771    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
772    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
773    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
774    const char *errors          /* error handling */
775    );
776
777/* --- UTF-8 Codecs ------------------------------------------------------- */
778
779PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
780    const char *string,         /* UTF-8 encoded string */
781    Py_ssize_t length,          /* size of string */
782    const char *errors          /* error handling */
783    );
784
785PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
786    const char *string,         /* UTF-8 encoded string */
787    Py_ssize_t length,          /* size of string */
788    const char *errors,         /* error handling */
789    Py_ssize_t *consumed        /* bytes consumed */
790    );
791
792PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
793    PyObject *unicode           /* Unicode object */
794    );
795
796PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
797    const Py_UNICODE *data,     /* Unicode char buffer */
798    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
799    const char *errors          /* error handling */
800    );
801
802/* --- UTF-32 Codecs ------------------------------------------------------ */
803
804/* Decodes length bytes from a UTF-32 encoded buffer string and returns
805   the corresponding Unicode object.
806
807   errors (if non-NULL) defines the error handling. It defaults
808   to "strict".
809
810   If byteorder is non-NULL, the decoder starts decoding using the
811   given byte order:
812
813    *byteorder == -1: little endian
814    *byteorder == 0:  native order
815    *byteorder == 1:  big endian
816
817   In native mode, the first four bytes of the stream are checked for a
818   BOM mark. If found, the BOM mark is analysed, the byte order
819   adjusted and the BOM skipped.  In the other modes, no BOM mark
820   interpretation is done. After completion, *byteorder is set to the
821   current byte order at the end of input data.
822
823   If byteorder is NULL, the codec starts in native order mode.
824
825*/
826
827PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
828    const char *string,         /* UTF-32 encoded string */
829    Py_ssize_t length,          /* size of string */
830    const char *errors,         /* error handling */
831    int *byteorder              /* pointer to byteorder to use
832                                   0=native;-1=LE,1=BE; updated on
833                                   exit */
834    );
835
836PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
837    const char *string,         /* UTF-32 encoded string */
838    Py_ssize_t length,          /* size of string */
839    const char *errors,         /* error handling */
840    int *byteorder,             /* pointer to byteorder to use
841                                   0=native;-1=LE,1=BE; updated on
842                                   exit */
843    Py_ssize_t *consumed        /* bytes consumed */
844    );
845
846/* Returns a Python string using the UTF-32 encoding in native byte
847   order. The string always starts with a BOM mark.  */
848
849PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
850    PyObject *unicode           /* Unicode object */
851    );
852
853/* Returns a Python string object holding the UTF-32 encoded value of
854   the Unicode data.
855
856   If byteorder is not 0, output is written according to the following
857   byte order:
858
859   byteorder == -1: little endian
860   byteorder == 0:  native byte order (writes a BOM mark)
861   byteorder == 1:  big endian
862
863   If byteorder is 0, the output string will always start with the
864   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
865   prepended.
866
867*/
868
869PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
870    const Py_UNICODE *data,     /* Unicode char buffer */
871    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
872    const char *errors,         /* error handling */
873    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
874    );
875
876/* --- UTF-16 Codecs ------------------------------------------------------ */
877
878/* Decodes length bytes from a UTF-16 encoded buffer string and returns
879   the corresponding Unicode object.
880
881   errors (if non-NULL) defines the error handling. It defaults
882   to "strict".
883
884   If byteorder is non-NULL, the decoder starts decoding using the
885   given byte order:
886
887    *byteorder == -1: little endian
888    *byteorder == 0:  native order
889    *byteorder == 1:  big endian
890
891   In native mode, the first two bytes of the stream are checked for a
892   BOM mark. If found, the BOM mark is analysed, the byte order
893   adjusted and the BOM skipped.  In the other modes, no BOM mark
894   interpretation is done. After completion, *byteorder is set to the
895   current byte order at the end of input data.
896
897   If byteorder is NULL, the codec starts in native order mode.
898
899*/
900
901PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
902    const char *string,         /* UTF-16 encoded string */
903    Py_ssize_t length,          /* size of string */
904    const char *errors,         /* error handling */
905    int *byteorder              /* pointer to byteorder to use
906                                   0=native;-1=LE,1=BE; updated on
907                                   exit */
908    );
909
910PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
911    const char *string,         /* UTF-16 encoded string */
912    Py_ssize_t length,          /* size of string */
913    const char *errors,         /* error handling */
914    int *byteorder,             /* pointer to byteorder to use
915                                   0=native;-1=LE,1=BE; updated on
916                                   exit */
917    Py_ssize_t *consumed        /* bytes consumed */
918    );
919
920/* Returns a Python string using the UTF-16 encoding in native byte
921   order. The string always starts with a BOM mark.  */
922
923PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
924    PyObject *unicode           /* Unicode object */
925    );
926
927/* Returns a Python string object holding the UTF-16 encoded value of
928   the Unicode data.
929
930   If byteorder is not 0, output is written according to the following
931   byte order:
932
933   byteorder == -1: little endian
934   byteorder == 0:  native byte order (writes a BOM mark)
935   byteorder == 1:  big endian
936
937   If byteorder is 0, the output string will always start with the
938   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
939   prepended.
940
941   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
942   UCS-2. This trick makes it possible to add full UTF-16 capabilities
943   at a later point without compromising the APIs.
944
945*/
946
947PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
948    const Py_UNICODE *data,     /* Unicode char buffer */
949    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
950    const char *errors,         /* error handling */
951    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
952    );
953
954/* --- Unicode-Escape Codecs ---------------------------------------------- */
955
956PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
957    const char *string,         /* Unicode-Escape encoded string */
958    Py_ssize_t length,          /* size of string */
959    const char *errors          /* error handling */
960    );
961
962PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
963    PyObject *unicode           /* Unicode object */
964    );
965
966PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
967    const Py_UNICODE *data,     /* Unicode char buffer */
968    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
969    );
970
971/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
972
973PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
974    const char *string,         /* Raw-Unicode-Escape encoded string */
975    Py_ssize_t length,          /* size of string */
976    const char *errors          /* error handling */
977    );
978
979PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
980    PyObject *unicode           /* Unicode object */
981    );
982
983PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
984    const Py_UNICODE *data,     /* Unicode char buffer */
985    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
986    );
987
988/* --- Unicode Internal Codec ---------------------------------------------
989
990    Only for internal use in _codecsmodule.c */
991
992PyObject *_PyUnicode_DecodeUnicodeInternal(
993    const char *string,
994    Py_ssize_t length,
995    const char *errors
996    );
997
998/* --- Latin-1 Codecs -----------------------------------------------------
999
1000   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1001
1002*/
1003
1004PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1005    const char *string,         /* Latin-1 encoded string */
1006    Py_ssize_t length,          /* size of string */
1007    const char *errors          /* error handling */
1008    );
1009
1010PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1011    PyObject *unicode           /* Unicode object */
1012    );
1013
1014PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1015    const Py_UNICODE *data,     /* Unicode char buffer */
1016    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1017    const char *errors          /* error handling */
1018    );
1019
1020/* --- ASCII Codecs -------------------------------------------------------
1021
1022   Only 7-bit ASCII data is excepted. All other codes generate errors.
1023
1024*/
1025
1026PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1027    const char *string,         /* ASCII encoded string */
1028    Py_ssize_t length,          /* size of string */
1029    const char *errors          /* error handling */
1030    );
1031
1032PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1033    PyObject *unicode           /* Unicode object */
1034    );
1035
1036PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1037    const Py_UNICODE *data,     /* Unicode char buffer */
1038    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1039    const char *errors          /* error handling */
1040    );
1041
1042/* --- Character Map Codecs -----------------------------------------------
1043
1044   This codec uses mappings to encode and decode characters.
1045
1046   Decoding mappings must map single string characters to single
1047   Unicode characters, integers (which are then interpreted as Unicode
1048   ordinals) or None (meaning "undefined mapping" and causing an
1049   error).
1050
1051   Encoding mappings must map single Unicode characters to single
1052   string characters, integers (which are then interpreted as Latin-1
1053   ordinals) or None (meaning "undefined mapping" and causing an
1054   error).
1055
1056   If a character lookup fails with a LookupError, the character is
1057   copied as-is meaning that its ordinal value will be interpreted as
1058   Unicode or Latin-1 ordinal resp. Because of this mappings only need
1059   to contain those mappings which map characters to different code
1060   points.
1061
1062*/
1063
1064PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1065    const char *string,         /* Encoded string */
1066    Py_ssize_t length,          /* size of string */
1067    PyObject *mapping,          /* character mapping
1068                                   (char ordinal -> unicode ordinal) */
1069    const char *errors          /* error handling */
1070    );
1071
1072PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1073    PyObject *unicode,          /* Unicode object */
1074    PyObject *mapping           /* character mapping
1075                                   (unicode ordinal -> char ordinal) */
1076    );
1077
1078PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1079    const Py_UNICODE *data,     /* Unicode char buffer */
1080    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1081    PyObject *mapping,          /* character mapping
1082                                   (unicode ordinal -> char ordinal) */
1083    const char *errors          /* error handling */
1084    );
1085
1086/* Translate a Py_UNICODE buffer of the given length by applying a
1087   character mapping table to it and return the resulting Unicode
1088   object.
1089
1090   The mapping table must map Unicode ordinal integers to Unicode
1091   ordinal integers or None (causing deletion of the character).
1092
1093   Mapping tables may be dictionaries or sequences. Unmapped character
1094   ordinals (ones which cause a LookupError) are left untouched and
1095   are copied as-is.
1096
1097*/
1098
1099PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1100    const Py_UNICODE *data,     /* Unicode char buffer */
1101    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1102    PyObject *table,            /* Translate table */
1103    const char *errors          /* error handling */
1104    );
1105
1106#ifdef MS_WIN32
1107
1108/* --- MBCS codecs for Windows -------------------------------------------- */
1109
1110PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1111    const char *string,         /* MBCS encoded string */
1112    Py_ssize_t length,              /* size of string */
1113    const char *errors          /* error handling */
1114    );
1115
1116PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1117    const char *string,         /* MBCS encoded string */
1118    Py_ssize_t length,          /* size of string */
1119    const char *errors,         /* error handling */
1120    Py_ssize_t *consumed        /* bytes consumed */
1121    );
1122
1123PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1124    PyObject *unicode           /* Unicode object */
1125    );
1126
1127PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1128    const Py_UNICODE *data,     /* Unicode char buffer */
1129    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1130    const char *errors          /* error handling */
1131    );
1132
1133#endif /* MS_WIN32 */
1134
1135/* --- Decimal Encoder ---------------------------------------------------- */
1136
1137/* Takes a Unicode string holding a decimal value and writes it into
1138   an output buffer using standard ASCII digit codes.
1139
1140   The output buffer has to provide at least length+1 bytes of storage
1141   area. The output string is 0-terminated.
1142
1143   The encoder converts whitespace to ' ', decimal characters to their
1144   corresponding ASCII digit and all other Latin-1 characters except
1145   \0 as-is. Characters outside this range (Unicode ordinals 1-256)
1146   are treated as errors. This includes embedded NULL bytes.
1147
1148   Error handling is defined by the errors argument:
1149
1150      NULL or "strict": raise a ValueError
1151      "ignore": ignore the wrong characters (these are not copied to the
1152                output buffer)
1153      "replace": replaces illegal characters with '?'
1154
1155   Returns 0 on success, -1 on failure.
1156
1157*/
1158
1159PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1160    Py_UNICODE *s,              /* Unicode buffer */
1161    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1162    char *output,               /* Output buffer; must have size >= length */
1163    const char *errors          /* error handling */
1164    );
1165
1166/* --- File system encoding ---------------------------------------------- */
1167
1168/* ParseTuple converter: encode str objects to bytes using
1169   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1170
1171PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1172
1173/* ParseTuple converter: decode bytes objects to unicode using
1174   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1175
1176PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1177
1178/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1179   and the "surrogateescape" error handler.
1180
1181   If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
1182
1183   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1184*/
1185
1186PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1187    const char *s               /* encoded string */
1188    );
1189
1190/* Decode a string using Py_FileSystemDefaultEncoding
1191   and the "surrogateescape" error handler.
1192
1193   If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
1194*/
1195
1196PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1197    const char *s,               /* encoded string */
1198    Py_ssize_t size              /* size */
1199    );
1200
1201/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1202   "surrogateescape" error handler, and return bytes.
1203
1204   If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8.
1205*/
1206
1207PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1208    PyObject *unicode
1209    );
1210
1211/* --- Methods & Slots ----------------------------------------------------
1212
1213   These are capable of handling Unicode objects and strings on input
1214   (we refer to them as strings in the descriptions) and return
1215   Unicode objects or integers as apporpriate. */
1216
1217/* Concat two strings giving a new Unicode string. */
1218
1219PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1220    PyObject *left,             /* Left string */
1221    PyObject *right             /* Right string */
1222    );
1223
1224/* Concat two strings and put the result in *pleft
1225   (sets *pleft to NULL on error) */
1226
1227PyAPI_FUNC(void) PyUnicode_Append(
1228    PyObject **pleft,           /* Pointer to left string */
1229    PyObject *right             /* Right string */
1230    );
1231
1232/* Concat two strings, put the result in *pleft and drop the right object
1233   (sets *pleft to NULL on error) */
1234
1235PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1236    PyObject **pleft,           /* Pointer to left string */
1237    PyObject *right             /* Right string */
1238    );
1239
1240/* Split a string giving a list of Unicode strings.
1241
1242   If sep is NULL, splitting will be done at all whitespace
1243   substrings. Otherwise, splits occur at the given separator.
1244
1245   At most maxsplit splits will be done. If negative, no limit is set.
1246
1247   Separators are not included in the resulting list.
1248
1249*/
1250
1251PyAPI_FUNC(PyObject*) PyUnicode_Split(
1252    PyObject *s,                /* String to split */
1253    PyObject *sep,              /* String separator */
1254    Py_ssize_t maxsplit         /* Maxsplit count */
1255    );
1256
1257/* Dito, but split at line breaks.
1258
1259   CRLF is considered to be one line break. Line breaks are not
1260   included in the resulting list. */
1261
1262PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1263    PyObject *s,                /* String to split */
1264    int keepends                /* If true, line end markers are included */
1265    );
1266
1267/* Partition a string using a given separator. */
1268
1269PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1270    PyObject *s,                /* String to partition */
1271    PyObject *sep               /* String separator */
1272    );
1273
1274/* Partition a string using a given separator, searching from the end of the
1275   string. */
1276
1277PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1278    PyObject *s,                /* String to partition */
1279    PyObject *sep               /* String separator */
1280    );
1281
1282/* Split a string giving a list of Unicode strings.
1283
1284   If sep is NULL, splitting will be done at all whitespace
1285   substrings. Otherwise, splits occur at the given separator.
1286
1287   At most maxsplit splits will be done. But unlike PyUnicode_Split
1288   PyUnicode_RSplit splits from the end of the string. If negative,
1289   no limit is set.
1290
1291   Separators are not included in the resulting list.
1292
1293*/
1294
1295PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1296    PyObject *s,                /* String to split */
1297    PyObject *sep,              /* String separator */
1298    Py_ssize_t maxsplit         /* Maxsplit count */
1299    );
1300
1301/* Translate a string by applying a character mapping table to it and
1302   return the resulting Unicode object.
1303
1304   The mapping table must map Unicode ordinal integers to Unicode
1305   ordinal integers or None (causing deletion of the character).
1306
1307   Mapping tables may be dictionaries or sequences. Unmapped character
1308   ordinals (ones which cause a LookupError) are left untouched and
1309   are copied as-is.
1310
1311*/
1312
1313PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1314    PyObject *str,              /* String */
1315    PyObject *table,            /* Translate table */
1316    const char *errors          /* error handling */
1317    );
1318
1319/* Join a sequence of strings using the given separator and return
1320   the resulting Unicode string. */
1321
1322PyAPI_FUNC(PyObject*) PyUnicode_Join(
1323    PyObject *separator,        /* Separator string */
1324    PyObject *seq               /* Sequence object */
1325    );
1326
1327/* Return 1 if substr matches str[start:end] at the given tail end, 0
1328   otherwise. */
1329
1330PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1331    PyObject *str,              /* String */
1332    PyObject *substr,           /* Prefix or Suffix string */
1333    Py_ssize_t start,           /* Start index */
1334    Py_ssize_t end,             /* Stop index */
1335    int direction               /* Tail end: -1 prefix, +1 suffix */
1336    );
1337
1338/* Return the first position of substr in str[start:end] using the
1339   given search direction or -1 if not found. -2 is returned in case
1340   an error occurred and an exception is set. */
1341
1342PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1343    PyObject *str,              /* String */
1344    PyObject *substr,           /* Substring to find */
1345    Py_ssize_t start,           /* Start index */
1346    Py_ssize_t end,             /* Stop index */
1347    int direction               /* Find direction: +1 forward, -1 backward */
1348    );
1349
1350/* Count the number of occurrences of substr in str[start:end]. */
1351
1352PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1353    PyObject *str,              /* String */
1354    PyObject *substr,           /* Substring to count */
1355    Py_ssize_t start,           /* Start index */
1356    Py_ssize_t end              /* Stop index */
1357    );
1358
1359/* Replace at most maxcount occurrences of substr in str with replstr
1360   and return the resulting Unicode object. */
1361
1362PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1363    PyObject *str,              /* String */
1364    PyObject *substr,           /* Substring to find */
1365    PyObject *replstr,          /* Substring to replace */
1366    Py_ssize_t maxcount         /* Max. number of replacements to apply;
1367                                   -1 = all */
1368    );
1369
1370/* Compare two strings and return -1, 0, 1 for less than, equal,
1371   greater than resp. */
1372
1373PyAPI_FUNC(int) PyUnicode_Compare(
1374    PyObject *left,             /* Left string */
1375    PyObject *right             /* Right string */
1376    );
1377
1378PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1379    PyObject *left,
1380    const char *right
1381    );
1382
1383/* Rich compare two strings and return one of the following:
1384
1385   - NULL in case an exception was raised
1386   - Py_True or Py_False for successfuly comparisons
1387   - Py_NotImplemented in case the type combination is unknown
1388
1389   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1390   case the conversion of the arguments to Unicode fails with a
1391   UnicodeDecodeError.
1392
1393   Possible values for op:
1394
1395     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1396
1397*/
1398
1399PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1400    PyObject *left,             /* Left string */
1401    PyObject *right,            /* Right string */
1402    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1403    );
1404
1405/* Apply a argument tuple or dictionary to a format string and return
1406   the resulting Unicode string. */
1407
1408PyAPI_FUNC(PyObject *) PyUnicode_Format(
1409    PyObject *format,           /* Format string */
1410    PyObject *args              /* Argument tuple or dictionary */
1411    );
1412
1413/* Checks whether element is contained in container and return 1/0
1414   accordingly.
1415
1416   element has to coerce to an one element Unicode string. -1 is
1417   returned in case of an error. */
1418
1419PyAPI_FUNC(int) PyUnicode_Contains(
1420    PyObject *container,        /* Container string */
1421    PyObject *element           /* Element string */
1422    );
1423
1424/* Checks whether argument is a valid identifier. */
1425
1426PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1427
1428/* Externally visible for str.strip(unicode) */
1429PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1430    PyUnicodeObject *self,
1431    int striptype,
1432    PyObject *sepobj
1433    );
1434
1435/* Using the current locale, insert the thousands grouping
1436   into the string pointed to by buffer.  For the argument descriptions,
1437   see Objects/stringlib/localeutil.h */
1438
1439PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1440                                                   Py_ssize_t n_buffer,
1441                                                   Py_UNICODE *digits,
1442                                                   Py_ssize_t n_digits,
1443                                                   Py_ssize_t min_width);
1444
1445/* Using explicit passed-in values, insert the thousands grouping
1446   into the string pointed to by buffer.  For the argument descriptions,
1447   see Objects/stringlib/localeutil.h */
1448PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer,
1449                                                   Py_ssize_t n_buffer,
1450                                                   Py_UNICODE *digits,
1451                                                   Py_ssize_t n_digits,
1452                                                   Py_ssize_t min_width,
1453                                                   const char *grouping,
1454                                                   const char *thousands_sep);
1455/* === Characters Type APIs =============================================== */
1456
1457/* Helper array used by Py_UNICODE_ISSPACE(). */
1458
1459PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1460
1461/* These should not be used directly. Use the Py_UNICODE_IS* and
1462   Py_UNICODE_TO* macros instead.
1463
1464   These APIs are implemented in Objects/unicodectype.c.
1465
1466*/
1467
1468PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1469    Py_UCS4 ch       /* Unicode character */
1470    );
1471
1472PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1473    Py_UCS4 ch       /* Unicode character */
1474    );
1475
1476PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1477    Py_UCS4 ch       /* Unicode character */
1478    );
1479
1480PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1481    Py_UCS4 ch       /* Unicode character */
1482    );
1483
1484PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1485    Py_UCS4 ch       /* Unicode character */
1486    );
1487
1488PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1489    const Py_UCS4 ch         /* Unicode character */
1490    );
1491
1492PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1493    const Py_UCS4 ch         /* Unicode character */
1494    );
1495
1496PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1497    Py_UCS4 ch       /* Unicode character */
1498    );
1499
1500PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1501    Py_UCS4 ch       /* Unicode character */
1502    );
1503
1504PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1505    Py_UCS4 ch       /* Unicode character */
1506    );
1507
1508PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1509    Py_UCS4 ch       /* Unicode character */
1510    );
1511
1512PyAPI_FUNC(int) _PyUnicode_ToDigit(
1513    Py_UCS4 ch       /* Unicode character */
1514    );
1515
1516PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1517    Py_UCS4 ch       /* Unicode character */
1518    );
1519
1520PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1521    Py_UCS4 ch       /* Unicode character */
1522    );
1523
1524PyAPI_FUNC(int) _PyUnicode_IsDigit(
1525    Py_UCS4 ch       /* Unicode character */
1526    );
1527
1528PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1529    Py_UCS4 ch       /* Unicode character */
1530    );
1531
1532PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1533    Py_UCS4 ch       /* Unicode character */
1534    );
1535
1536PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1537    Py_UCS4 ch       /* Unicode character */
1538    );
1539
1540PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1541    const Py_UNICODE *u
1542    );
1543
1544PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1545    Py_UNICODE *s1,
1546    const Py_UNICODE *s2);
1547
1548PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1549    Py_UNICODE *s1, const Py_UNICODE *s2);
1550
1551PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1552    Py_UNICODE *s1,
1553    const Py_UNICODE *s2,
1554    size_t n);
1555
1556PyAPI_FUNC(int) Py_UNICODE_strcmp(
1557    const Py_UNICODE *s1,
1558    const Py_UNICODE *s2
1559    );
1560
1561PyAPI_FUNC(int) Py_UNICODE_strncmp(
1562    const Py_UNICODE *s1,
1563    const Py_UNICODE *s2,
1564    size_t n
1565    );
1566
1567PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1568    const Py_UNICODE *s,
1569    Py_UNICODE c
1570    );
1571
1572PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
1573    const Py_UNICODE *s,
1574    Py_UNICODE c
1575    );
1576
1577/* Create a copy of a unicode string ending with a nul character. Return NULL
1578   and raise a MemoryError exception on memory allocation failure, otherwise
1579   return a new allocated buffer (use PyMem_Free() to free the buffer). */
1580
1581PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
1582    PyObject *unicode
1583    );
1584
1585#ifdef __cplusplus
1586}
1587#endif
1588#endif /* !Py_UNICODEOBJECT_H */
1589