/* unicodeobject.h revision 4d0d471a8031de90a2b1ce99c4ac4780e60b3bc9 */
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Unicode Integration Proposal. (See
http://www.egenix.com/files/python/unicode-proposal.txt).

Copyright (c) Corporation for National Research Initiatives.


 Original header:
 --------------------------------------------------------------------

 * Yet another Unicode string type for Python.  This type supports the
 * 16-bit Basic Multilingual Plane (BMP) only.
 *
 * Written by Fredrik Lundh, January 1999.
 *
 * Copyright (c) 1999 by Secret Labs AB.
 * Copyright (c) 1999 by Fredrik Lundh.
 *
 * fredrik@pythonware.com
 * http://www.pythonware.com
 *
 * --------------------------------------------------------------------
 * This Unicode String Type is
 *
 * Copyright (c) 1999 by Secret Labs AB
 * Copyright (c) 1999 by Fredrik Lundh
 *
 * By obtaining, using, and/or copying this software and/or its
 * associated documentation, you agree that you have read, understood,
 * and will comply with the following terms and conditions:
 *
 * Permission to use, copy, modify, and distribute this software and its
 * associated documentation for any purpose and without fee is hereby
 * granted, provided that the above copyright notice appears in all
 * copies, and that both that copyright notice and this permission notice
 * appear in supporting documentation, and that the name of Secret Labs
 * AB or the author not be used in advertising or publicity pertaining to
 * distribution of the software without specific, written prior
 * permission.
 *
 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 * -------------------------------------------------------------------- */
57
#include <ctype.h>

/* === Internal API ======================================================= */

/* --- Internal Unicode Format -------------------------------------------- */

/* Python 3.x requires unicode, so this feature macro is defined
   unconditionally. */
#define Py_USING_UNICODE
66
/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
   properly set, but the default rules below don't set it.  I'll
   sort this out some other day -- fredrik@pythonware.com */

/* Py_UNICODE_SIZE (size of one code unit, in bytes) must be defined
   before this point; the build is stopped otherwise. */
#ifndef Py_UNICODE_SIZE
#error Must define Py_UNICODE_SIZE
#endif
74
/* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
   strings are stored as UCS-2 (with limited support for UTF-16).

   The flag is derived from Py_UNICODE_SIZE: it is defined whenever a
   code unit is at least 4 bytes wide. */

#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
#endif
81
/* Set these flags if the platform has "wchar.h" and the
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */

/* Defaults for various platforms.  These only apply when the build
   configuration has not already chosen PY_UNICODE_TYPE. */
#ifndef PY_UNICODE_TYPE

/* Windows has a usable wchar_t type (unless we're using UCS-4) */
# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
#  define HAVE_USABLE_WCHAR_T
#  define PY_UNICODE_TYPE wchar_t
# endif

/* Wide (UCS-4) builds store code units as Py_UCS4. */
# if defined(Py_UNICODE_WIDE)
#  define PY_UNICODE_TYPE Py_UCS4
# endif

#endif
101
/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
# include <wchar.h>
#endif
119
/*
 * Use this typedef when you need to represent a UTF-16 surrogate pair
 * as a single unsigned integer.
 *
 * The first of unsigned int / unsigned long that is at least 4 bytes
 * wide (according to SIZEOF_INT / SIZEOF_LONG) is used.  If neither
 * qualifies the build is stopped here, instead of leaving Py_UCS4
 * undeclared and failing later with an unrelated-looking error.
 */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#elif SIZEOF_LONG >= 4
typedef unsigned long Py_UCS4;
#else
#error "Could not find a proper typedef for Py_UCS4"
#endif
129
/* Py_UNICODE is the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode
   type.  It is an alias for PY_UNICODE_TYPE and is not declared when
   Py_LIMITED_API is defined. */

#ifndef Py_LIMITED_API
typedef PY_UNICODE_TYPE Py_UNICODE;
#endif
137
/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */

/* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
   produce different external names.  Combining an interpreter and
   extensions that were compiled with different Unicode width
   assumptions therefore causes import errors.

   Each public name maps to PyUnicodeUCS2_<name> on narrow builds and to
   PyUnicodeUCS4_<name> on wide builds (Py_UNICODE_WIDE defined).  Note
   that PyUnicode_ClearFreeList maps to a symbol spelled "ClearFreelist"
   (lower-case 'l'); the spelling is part of the exported name and must
   be kept identical in both tables. */

#ifndef Py_UNICODE_WIDE

# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode
# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
# define PyUnicode_AsWideCharString PyUnicodeUCS2_AsWideCharString
# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist
# define PyUnicode_Compare PyUnicodeUCS2_Compare
# define PyUnicode_CompareWithASCIIString PyUnicodeUCS2_CompareWithASCIIString
# define PyUnicode_Concat PyUnicodeUCS2_Concat
# define PyUnicode_Append PyUnicodeUCS2_Append
# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
# define PyUnicode_Contains PyUnicodeUCS2_Contains
# define PyUnicode_Count PyUnicodeUCS2_Count
# define PyUnicode_Decode PyUnicodeUCS2_Decode
# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
# define PyUnicode_Encode PyUnicodeUCS2_Encode
# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
# define PyUnicode_Find PyUnicodeUCS2_Find
# define PyUnicode_Format PyUnicodeUCS2_Format
# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
# define PyUnicode_FromString PyUnicodeUCS2_FromString
# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter
# define PyUnicode_FSDecoder PyUnicodeUCS2_FSDecoder
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
# define PyUnicode_Join PyUnicodeUCS2_Join
# define PyUnicode_Partition PyUnicodeUCS2_Partition
# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
# define PyUnicode_Replace PyUnicodeUCS2_Replace
# define PyUnicode_Resize PyUnicodeUCS2_Resize
# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
# define PyUnicode_Split PyUnicodeUCS2_Split
# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
# define PyUnicode_Translate PyUnicodeUCS2_Translate
# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
# define _PyUnicode_Init _PyUnicodeUCS2_Init
# define PyUnicode_strdup PyUnicodeUCS2_strdup

#else

# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode
# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
# define PyUnicode_AsWideCharString PyUnicodeUCS4_AsWideCharString
# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist
# define PyUnicode_Compare PyUnicodeUCS4_Compare
# define PyUnicode_CompareWithASCIIString PyUnicodeUCS4_CompareWithASCIIString
# define PyUnicode_Concat PyUnicodeUCS4_Concat
# define PyUnicode_Append PyUnicodeUCS4_Append
# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
# define PyUnicode_Contains PyUnicodeUCS4_Contains
# define PyUnicode_Count PyUnicodeUCS4_Count
# define PyUnicode_Decode PyUnicodeUCS4_Decode
# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
# define PyUnicode_Encode PyUnicodeUCS4_Encode
# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
# define PyUnicode_Find PyUnicodeUCS4_Find
# define PyUnicode_Format PyUnicodeUCS4_Format
# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
# define PyUnicode_FromString PyUnicodeUCS4_FromString
# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter
# define PyUnicode_FSDecoder PyUnicodeUCS4_FSDecoder
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
# define PyUnicode_Join PyUnicodeUCS4_Join
# define PyUnicode_Partition PyUnicodeUCS4_Partition
# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
# define PyUnicode_Replace PyUnicodeUCS4_Replace
# define PyUnicode_Resize PyUnicodeUCS4_Resize
# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
# define PyUnicode_Split PyUnicodeUCS4_Split
# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
# define PyUnicode_Translate PyUnicodeUCS4_Translate
# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
# define _PyUnicode_Init _PyUnicodeUCS4_Init
# define PyUnicode_strdup PyUnicodeUCS4_strdup

#endif
314
/* --- Internal Unicode Operations ---------------------------------------- */

/* Since splitting on whitespace is an important use case, and
   whitespace in most situations is solely ASCII whitespace, we
   optimize for the common case by using a quick look-up table
   _Py_ascii_whitespace (see below) with an inlined check.

   All macros in this group are hidden when Py_LIMITED_API is defined.
   Several of them (Py_UNICODE_ISSPACE, Py_UNICODE_ISALNUM,
   Py_UNICODE_MATCH) expand an argument more than once, so do not pass
   expressions with side effects to them.
 */
#ifndef Py_LIMITED_API
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* Character classification: thin aliases for the _PyUnicode_Is*()
   functions. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversion: thin aliases for the _PyUnicode_To*case() functions. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if ch is alphabetic, a decimal, a digit or numeric. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))

/* Copy length Py_UNICODE code units from source to target using
   Py_MEMCPY. */
#define Py_UNICODE_COPY(target, source, length)                         \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store value into the first length code units of target.

   Every argument is evaluated exactly once, in the order target, value,
   length; the length is captured in a local so that an expression with
   side effects is not re-run on each loop iteration. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length); Py_ssize_t i_;\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)

/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and the last code unit are compared before falling back to
   memcmp() over the whole substring. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->str + (offset)) == *((substring)->str)) && \
    ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
     !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
#endif /* Py_LIMITED_API */
369
/* Give the declarations that follow C linkage when this header is
   included from C++. */
#ifdef __cplusplus
extern "C" {
#endif
373
374/* --- Unicode Type ------------------------------------------------------- */
375
376#ifndef Py_LIMITED_API
377typedef struct {
378    PyObject_HEAD
379    Py_ssize_t length;          /* Length of raw Unicode data in buffer */
380    Py_UNICODE *str;            /* Raw Unicode buffer */
381    Py_hash_t hash;             /* Hash value; -1 if not set */
382    int state;                  /* != 0 if interned. In this case the two
383                                 * references from the dictionary to this object
384                                 * are *not* counted in ob_refcnt. */
385    PyObject *defenc;           /* (Default) Encoded version as Python
386                                   string, or NULL; this is used for
387                                   implementing the buffer protocol */
388} PyUnicodeObject;
389#endif
390
391PyAPI_DATA(PyTypeObject) PyUnicode_Type;
392PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
393
/* Interning states.  NOTE(review): presumably the values stored in
   PyUnicodeObject.state -- confirm against the implementation. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
397
/* PyUnicode_Check(op): the type of op has the
   Py_TPFLAGS_UNICODE_SUBCLASS flag set.
   PyUnicode_CheckExact(op): the type of op is exactly PyUnicode_Type. */
#define PyUnicode_Check(op) \
                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
401
/* Fast access macros.

   They read the PyUnicodeObject fields directly; the only checking is
   an assert() that op passes PyUnicode_Check().  Not available when
   Py_LIMITED_API is defined.

   PyUnicode_GET_SIZE       length field (number of code units)
   PyUnicode_GET_DATA_SIZE  length * sizeof(Py_UNICODE), i.e. in bytes
   PyUnicode_AS_UNICODE     the str buffer as Py_UNICODE *
   PyUnicode_AS_DATA        the str buffer as const char * */
#ifndef Py_LIMITED_API
#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length))
#define PyUnicode_GET_DATA_SIZE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str))
#define PyUnicode_AS_DATA(op) \
    (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str))
#endif
413
/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
422
423/* === Public API ========================================================= */
424
425/* --- Plain Py_UNICODE --------------------------------------------------- */
426
427/* Create a Unicode Object from the Py_UNICODE buffer u of the given
428   size.
429
430   u may be NULL which causes the contents to be undefined. It is the
431   user's responsibility to fill in the needed data afterwards. Note
432   that modifying the Unicode object contents after construction is
433   only allowed if u was set to NULL.
434
435   The buffer is copied into the new object. */
436
437#ifndef Py_LIMITED_API
438PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
439    const Py_UNICODE *u,        /* Unicode buffer */
440    Py_ssize_t size             /* size of buffer */
441    );
442#endif
443
444/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
445PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
446    const char *u,        /* char buffer */
447    Py_ssize_t size       /* size of buffer */
448    );
449
450/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
451   UTF-8 encoded bytes */
452PyAPI_FUNC(PyObject*) PyUnicode_FromString(
453    const char *u        /* string */
454    );
455
456/* Return a read-only pointer to the Unicode object's internal
457   Py_UNICODE buffer. */
458
459#ifndef Py_LIMITED_API
460PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
461    PyObject *unicode           /* Unicode object */
462    );
463#endif
464
465/* Get the length of the Unicode object. */
466
467PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
468    PyObject *unicode           /* Unicode object */
469    );
470
471#ifndef Py_LIMITED_API
472/* Get the maximum ordinal for a Unicode character. */
473PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
474#endif
475
476/* Resize an already allocated Unicode object to the new size length.
477
478   *unicode is modified to point to the new (resized) object and 0
479   returned on success.
480
481   This API may only be called by the function which also called the
482   Unicode constructor. The refcount on the object must be 1. Otherwise,
483   an error is returned.
484
485   Error handling is implemented as follows: an exception is set, -1
486   is returned and *unicode left untouched.
487
488*/
489
490PyAPI_FUNC(int) PyUnicode_Resize(
491    PyObject **unicode,         /* Pointer to the Unicode object */
492    Py_ssize_t length           /* New length */
493    );
494
495/* Coerce obj to an Unicode object and return a reference with
496   *incremented* refcount.
497
498   Coercion is done in the following way:
499
500   1. bytes, bytearray and other char buffer compatible objects are decoded
501      under the assumptions that they contain data using the UTF-8
502      encoding. Decoding is done in "strict" mode.
503
504   2. All other objects (including Unicode objects) raise an
505      exception.
506
507   The API returns NULL in case of an error. The caller is responsible
508   for decref'ing the returned objects.
509
510*/
511
512PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
513    register PyObject *obj,     /* Object */
514    const char *encoding,       /* encoding */
515    const char *errors          /* error handling */
516    );
517
518/* Coerce obj to an Unicode object and return a reference with
519   *incremented* refcount.
520
521   Unicode objects are passed back as-is (subclasses are converted to
522   true Unicode objects), all other objects are delegated to
523   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
524   using UTF-8 encoding as basis for decoding the object.
525
526   The API returns NULL in case of an error. The caller is responsible
527   for decref'ing the returned objects.
528
529*/
530
531PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
532    register PyObject *obj      /* Object */
533    );
534
535PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
536    const char *format,   /* ASCII-encoded string  */
537    va_list vargs
538    );
539PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
540    const char *format,   /* ASCII-encoded string  */
541    ...
542    );
543
544#ifndef Py_LIMITED_API
545/* Format the object based on the format_spec, as defined in PEP 3101
546   (Advanced String Formatting). */
547PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
548                                                 Py_UNICODE *format_spec,
549                                                 Py_ssize_t format_spec_len);
550#endif
551
552PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
553PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
554PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
555#ifndef Py_LIMITED_API
556PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
557#endif
558
/* Use only if you know it's a string.

   Expands to the object's `state` field (non-zero if interned).  The
   macro casts to PyUnicodeObject, whose layout is hidden when
   Py_LIMITED_API is defined, so it is guarded the same way. */
#ifndef Py_LIMITED_API
#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
#endif
561
/* --- wchar_t support for platforms which support it --------------------- */

/* The `register` storage class was dropped from the parameters below:
   it has no effect in a prototype and is rejected by C++17 compilers
   that include this header. */

#ifdef HAVE_WCHAR_H

/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */

PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Copies the Unicode Object contents into the wchar_t buffer w.  At
   most size wchar_t characters are copied.

   Note that the resulting wchar_t string may or may not be
   0-terminated.  It is the responsibility of the caller to make sure
   that the wchar_t string is 0-terminated in case this is required by
   the application.

   Returns the number of wchar_t characters copied (excluding a
   possibly trailing 0-termination character) or -1 in case of an
   error. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
    PyObject *unicode,          /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Convert the Unicode object to a wide character string. The output string
   always ends with a nul character. If size is not NULL, write the number of
   wide characters (including the nul character) into *size.

   Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it)
   on success. On error, returns NULL, *size is undefined and raises a
   MemoryError.

   NOTE(review): "PyMem_Alloc" is the allocator name given by the
   original comment; confirm the actual allocation function against the
   implementation. */

PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
    PyObject *unicode,          /* Unicode object */
    Py_ssize_t *size            /* number of characters of the result */
    );

#endif /* HAVE_WCHAR_H */
608
609/* --- Unicode ordinals --------------------------------------------------- */
610
611/* Create a Unicode Object from the given Unicode code point ordinal.
612
613   The ordinal must be in range(0x10000) on narrow Python builds
614   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
615   raised in case it is not.
616
617*/
618
619PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
620
621/* --- Free-list management ----------------------------------------------- */
622
623/* Clear the free list used by the Unicode implementation.
624
625   This can be used to release memory used for objects on the free
626   list back to the Python memory allocator.
627
628*/
629
630PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
631
/* === Builtin Codecs =====================================================

   Many of these APIs take two arguments encoding and errors. These
   parameters encoding and errors have the same semantics as the ones
   of the builtin str() API.

   Setting encoding to NULL causes the default encoding (UTF-8) to be used.

   Error handling is set by errors which may also be set to NULL
   meaning to use the default handling defined for the codec. Default
   error handling for all builtin codecs is "strict" (ValueErrors are
   raised).

   The codecs all use a similar interface. Only deviations from the
   generic ones are documented.

*/
649
650/* --- Manage the default encoding ---------------------------------------- */
651
652/* Return a Python string holding the default encoded value of the
653   Unicode object.
654
655   Same as PyUnicode_AsUTF8String() except
656   the resulting string is cached in the Unicode object for subsequent
657   usage by this function. The cached version is needed to implement
658   the character buffer interface and will live (at least) as long as
659   the Unicode object itself.
660
661   The refcount of the string is *not* incremented.
662
663   *** Exported for internal use by the interpreter only !!! ***
664
665*/
666
667#ifndef Py_LIMITED_API
668PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
669    PyObject *unicode,
670    const char *errors);
671#endif
672
673/* Returns a pointer to the default encoding (UTF-8) of the
674   Unicode object unicode and the size of the encoded representation
675   in bytes stored in *size.
676
677   In case of an error, no *size is set.
678
679   *** This API is for interpreter INTERNAL USE ONLY and will likely
680   *** be removed or changed in the future.
681
682   *** If you need to access the Unicode object as UTF-8 bytes string,
683   *** please use PyUnicode_AsUTF8String() instead.
684
685*/
686
687#ifndef Py_LIMITED_API
688PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize(
689    PyObject *unicode,
690    Py_ssize_t *size);
691#endif
692
693/* Returns a pointer to the default encoding (UTF-8) of the
694   Unicode object unicode.
695
696   Use of this API is DEPRECATED since no size information can be
697   extracted from the returned data.
698
699   *** This API is for interpreter INTERNAL USE ONLY and will likely
700   *** be removed or changed for Python 3.1.
701
702   *** If you need to access the Unicode object as UTF-8 bytes string,
703   *** please use PyUnicode_AsUTF8String() instead.
704
705*/
706
707#ifndef Py_LIMITED_API
708PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode);
709#endif
710
711/* Returns "utf-8".  */
712
713PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
714
715/* --- Generic Codecs ----------------------------------------------------- */
716
717/* Create a Unicode object by decoding the encoded string s of the
718   given size. */
719
720PyAPI_FUNC(PyObject*) PyUnicode_Decode(
721    const char *s,              /* encoded string */
722    Py_ssize_t size,            /* size of buffer */
723    const char *encoding,       /* encoding */
724    const char *errors          /* error handling */
725    );
726
727/* Decode a Unicode object unicode and return the result as Python
728   object. */
729
730PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
731    PyObject *unicode,          /* Unicode object */
732    const char *encoding,       /* encoding */
733    const char *errors          /* error handling */
734    );
735
736/* Decode a Unicode object unicode and return the result as Unicode
737   object. */
738
739PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
740    PyObject *unicode,          /* Unicode object */
741    const char *encoding,       /* encoding */
742    const char *errors          /* error handling */
743    );
744
745/* Encodes a Py_UNICODE buffer of the given size and returns a
746   Python string object. */
747
748#ifndef Py_LIMITED_API
749PyAPI_FUNC(PyObject*) PyUnicode_Encode(
750    const Py_UNICODE *s,        /* Unicode char buffer */
751    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
752    const char *encoding,       /* encoding */
753    const char *errors          /* error handling */
754    );
755#endif
756
757/* Encodes a Unicode object and returns the result as Python
758   object. */
759
760PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
761    PyObject *unicode,          /* Unicode object */
762    const char *encoding,       /* encoding */
763    const char *errors          /* error handling */
764    );
765
766/* Encodes a Unicode object and returns the result as a Python string
767   object. */
768
769PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
770    PyObject *unicode,          /* Unicode object */
771    const char *encoding,       /* encoding */
772    const char *errors          /* error handling */
773    );
774
775/* Encodes a Unicode object and returns the result as a Unicode
776   object. */
777
778PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
779    PyObject *unicode,          /* Unicode object */
780    const char *encoding,       /* encoding */
781    const char *errors          /* error handling */
782    );
783
784/* Build an encoding map. */
785
786PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
787    PyObject* string            /* 256 character map */
788   );
789
790/* --- UTF-7 Codecs ------------------------------------------------------- */
791
792PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
793    const char *string,         /* UTF-7 encoded string */
794    Py_ssize_t length,          /* size of string */
795    const char *errors          /* error handling */
796    );
797
798PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
799    const char *string,         /* UTF-7 encoded string */
800    Py_ssize_t length,          /* size of string */
801    const char *errors,         /* error handling */
802    Py_ssize_t *consumed        /* bytes consumed */
803    );
804
805#ifndef Py_LIMITED_API
806PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
807    const Py_UNICODE *data,     /* Unicode char buffer */
808    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
809    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
810    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
811    const char *errors          /* error handling */
812    );
813#endif
814
815/* --- UTF-8 Codecs ------------------------------------------------------- */
816
817PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
818    const char *string,         /* UTF-8 encoded string */
819    Py_ssize_t length,          /* size of string */
820    const char *errors          /* error handling */
821    );
822
823PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
824    const char *string,         /* UTF-8 encoded string */
825    Py_ssize_t length,          /* size of string */
826    const char *errors,         /* error handling */
827    Py_ssize_t *consumed        /* bytes consumed */
828    );
829
830PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
831    PyObject *unicode           /* Unicode object */
832    );
833
834#ifndef Py_LIMITED_API
835PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
836    const Py_UNICODE *data,     /* Unicode char buffer */
837    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
838    const char *errors          /* error handling */
839    );
840#endif
841
842/* --- UTF-32 Codecs ------------------------------------------------------ */
843
844/* Decodes length bytes from a UTF-32 encoded buffer string and returns
845   the corresponding Unicode object.
846
847   errors (if non-NULL) defines the error handling. It defaults
848   to "strict".
849
850   If byteorder is non-NULL, the decoder starts decoding using the
851   given byte order:
852
853    *byteorder == -1: little endian
854    *byteorder == 0:  native order
855    *byteorder == 1:  big endian
856
857   In native mode, the first four bytes of the stream are checked for a
858   BOM mark. If found, the BOM mark is analysed, the byte order
859   adjusted and the BOM skipped.  In the other modes, no BOM mark
860   interpretation is done. After completion, *byteorder is set to the
861   current byte order at the end of input data.
862
863   If byteorder is NULL, the codec starts in native order mode.
864
865*/
866
867PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
868    const char *string,         /* UTF-32 encoded string */
869    Py_ssize_t length,          /* size of string */
870    const char *errors,         /* error handling */
871    int *byteorder              /* pointer to byteorder to use
872                                   0=native;-1=LE,1=BE; updated on
873                                   exit */
874    );
875
876PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
877    const char *string,         /* UTF-32 encoded string */
878    Py_ssize_t length,          /* size of string */
879    const char *errors,         /* error handling */
880    int *byteorder,             /* pointer to byteorder to use
881                                   0=native;-1=LE,1=BE; updated on
882                                   exit */
883    Py_ssize_t *consumed        /* bytes consumed */
884    );
885
886/* Returns a Python string using the UTF-32 encoding in native byte
887   order. The string always starts with a BOM mark.  */
888
889PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
890    PyObject *unicode           /* Unicode object */
891    );
892
893/* Returns a Python string object holding the UTF-32 encoded value of
894   the Unicode data.
895
896   The output is written according to the byte order selected by
897   byteorder:
898
899   byteorder == -1: little endian
900   byteorder == 0:  native byte order (writes a BOM mark)
901   byteorder == 1:  big endian
902
903   If byteorder is 0, the output string will always start with the
904   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
905   prepended.
906
907*/
908
909#ifndef Py_LIMITED_API
910PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
911    const Py_UNICODE *data,     /* Unicode char buffer */
912    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
913    const char *errors,         /* error handling */
914    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
915    );
916#endif
917
918/* --- UTF-16 Codecs ------------------------------------------------------ */
919
920/* Decodes length bytes from a UTF-16 encoded buffer string and returns
921   the corresponding Unicode object.
922
923   errors (if non-NULL) defines the error handling. It defaults
924   to "strict".
925
926   If byteorder is non-NULL, the decoder starts decoding using the
927   given byte order:
928
929    *byteorder == -1: little endian
930    *byteorder == 0:  native order
931    *byteorder == 1:  big endian
932
933   In native mode, the first two bytes of the stream are checked for a
934   BOM mark. If found, the BOM mark is analysed, the byte order
935   adjusted and the BOM skipped.  In the other modes, no BOM mark
936   interpretation is done. After completion, *byteorder is set to the
937   current byte order at the end of input data.
938
939   If byteorder is NULL, the codec starts in native order mode.
940
941*/
942
943PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
944    const char *string,         /* UTF-16 encoded string */
945    Py_ssize_t length,          /* size of string */
946    const char *errors,         /* error handling */
947    int *byteorder              /* pointer to byteorder to use
948                                   0=native;-1=LE,1=BE; updated on
949                                   exit */
950    );
951
952PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
953    const char *string,         /* UTF-16 encoded string */
954    Py_ssize_t length,          /* size of string */
955    const char *errors,         /* error handling */
956    int *byteorder,             /* pointer to byteorder to use
957                                   0=native;-1=LE,1=BE; updated on
958                                   exit */
959    Py_ssize_t *consumed        /* bytes consumed */
960    );
961
962/* Returns a Python string using the UTF-16 encoding in native byte
963   order. The string always starts with a BOM mark.  */
964
965PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
966    PyObject *unicode           /* Unicode object */
967    );
968
969/* Returns a Python string object holding the UTF-16 encoded value of
970   the Unicode data.
971
972   The output is written according to the byte order selected by
973   byteorder:
974
975   byteorder == -1: little endian
976   byteorder == 0:  native byte order (writes a BOM mark)
977   byteorder == 1:  big endian
978
979   If byteorder is 0, the output string will always start with the
980   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
981   prepended.
982
983   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
984   UCS-2. This trick makes it possible to add full UTF-16 capabilities
985   at a later point without compromising the APIs.
986
987*/
988
989#ifndef Py_LIMITED_API
990PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
991    const Py_UNICODE *data,     /* Unicode char buffer */
992    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
993    const char *errors,         /* error handling */
994    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
995    );
996#endif
997
998/* --- Unicode-Escape Codecs ---------------------------------------------- */
999
1000PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
1001    const char *string,         /* Unicode-Escape encoded string */
1002    Py_ssize_t length,          /* size of string */
1003    const char *errors          /* error handling */
1004    );
1005
1006PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
1007    PyObject *unicode           /* Unicode object */
1008    );
1009
1010#ifndef Py_LIMITED_API
1011PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
1012    const Py_UNICODE *data,     /* Unicode char buffer */
1013    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1014    );
1015#endif
1016
1017/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1018
1019PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
1020    const char *string,         /* Raw-Unicode-Escape encoded string */
1021    Py_ssize_t length,          /* size of string */
1022    const char *errors          /* error handling */
1023    );
1024
1025PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
1026    PyObject *unicode           /* Unicode object */
1027    );
1028
1029#ifndef Py_LIMITED_API
1030PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
1031    const Py_UNICODE *data,     /* Unicode char buffer */
1032    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1033    );
1034#endif
1035
1036/* --- Unicode Internal Codec ---------------------------------------------
1037
1038    Only for internal use in _codecsmodule.c */
1039
1040#ifndef Py_LIMITED_API
1041PyObject *_PyUnicode_DecodeUnicodeInternal(
1042    const char *string,
1043    Py_ssize_t length,
1044    const char *errors
1045    );
1046#endif
1047
1048/* --- Latin-1 Codecs -----------------------------------------------------
1049
1050   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1051
1052*/
1053
1054PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1055    const char *string,         /* Latin-1 encoded string */
1056    Py_ssize_t length,          /* size of string */
1057    const char *errors          /* error handling */
1058    );
1059
1060PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1061    PyObject *unicode           /* Unicode object */
1062    );
1063
1064#ifndef Py_LIMITED_API
1065PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1066    const Py_UNICODE *data,     /* Unicode char buffer */
1067    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1068    const char *errors          /* error handling */
1069    );
1070#endif
1071
1072/* --- ASCII Codecs -------------------------------------------------------
1073
1074   Only 7-bit ASCII data is accepted. All other codes generate errors.
1075
1076*/
1077
1078PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1079    const char *string,         /* ASCII encoded string */
1080    Py_ssize_t length,          /* size of string */
1081    const char *errors          /* error handling */
1082    );
1083
1084PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1085    PyObject *unicode           /* Unicode object */
1086    );
1087
1088#ifndef Py_LIMITED_API
1089PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1090    const Py_UNICODE *data,     /* Unicode char buffer */
1091    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1092    const char *errors          /* error handling */
1093    );
1094#endif
1095
1096/* --- Character Map Codecs -----------------------------------------------
1097
1098   This codec uses mappings to encode and decode characters.
1099
1100   Decoding mappings must map single string characters to single
1101   Unicode characters, integers (which are then interpreted as Unicode
1102   ordinals) or None (meaning "undefined mapping" and causing an
1103   error).
1104
1105   Encoding mappings must map single Unicode characters to single
1106   string characters, integers (which are then interpreted as Latin-1
1107   ordinals) or None (meaning "undefined mapping" and causing an
1108   error).
1109
1110   If a character lookup fails with a LookupError, the character is
1111   copied as-is meaning that its ordinal value will be interpreted as
1112   Unicode or Latin-1 ordinal resp. Because of this mappings only need
1113   to contain those mappings which map characters to different code
1114   points.
1115
1116*/
1117
1118PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1119    const char *string,         /* Encoded string */
1120    Py_ssize_t length,          /* size of string */
1121    PyObject *mapping,          /* character mapping
1122                                   (char ordinal -> unicode ordinal) */
1123    const char *errors          /* error handling */
1124    );
1125
1126PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1127    PyObject *unicode,          /* Unicode object */
1128    PyObject *mapping           /* character mapping
1129                                   (unicode ordinal -> char ordinal) */
1130    );
1131
1132#ifndef Py_LIMITED_API
1133PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1134    const Py_UNICODE *data,     /* Unicode char buffer */
1135    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1136    PyObject *mapping,          /* character mapping
1137                                   (unicode ordinal -> char ordinal) */
1138    const char *errors          /* error handling */
1139    );
1140#endif
1141
1142/* Translate a Py_UNICODE buffer of the given length by applying a
1143   character mapping table to it and return the resulting Unicode
1144   object.
1145
1146   The mapping table must map Unicode ordinal integers to Unicode
1147   ordinal integers or None (causing deletion of the character).
1148
1149   Mapping tables may be dictionaries or sequences. Unmapped character
1150   ordinals (ones which cause a LookupError) are left untouched and
1151   are copied as-is.
1152
1153*/
1154
1155#ifndef Py_LIMITED_API
1156PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1157    const Py_UNICODE *data,     /* Unicode char buffer */
1158    Py_ssize_t length,          /* Number of Py_UNICODE chars to translate */
1159    PyObject *table,            /* Translation table */
1160    const char *errors          /* error handling */
1161    );
1162#endif
1163
1164#ifdef MS_WIN32
1165
1166/* --- MBCS codecs for Windows -------------------------------------------- */
1167
1168PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1169    const char *string,         /* MBCS encoded string */
1170    Py_ssize_t length,              /* size of string */
1171    const char *errors          /* error handling */
1172    );
1173
1174PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1175    const char *string,         /* MBCS encoded string */
1176    Py_ssize_t length,          /* size of string */
1177    const char *errors,         /* error handling */
1178    Py_ssize_t *consumed        /* bytes consumed */
1179    );
1180
1181PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1182    PyObject *unicode           /* Unicode object */
1183    );
1184
1185#ifndef Py_LIMITED_API
1186PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1187    const Py_UNICODE *data,     /* Unicode char buffer */
1188    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1189    const char *errors          /* error handling */
1190    );
1191#endif
1192
1193#endif /* MS_WIN32 */
1194
1195/* --- Decimal Encoder ---------------------------------------------------- */
1196
1197/* Takes a Unicode string holding a decimal value and writes it into
1198   an output buffer using standard ASCII digit codes.
1199
1200   The output buffer has to provide at least length+1 bytes of storage
1201   area. The output string is 0-terminated.
1202
1203   The encoder converts whitespace to ' ', decimal characters to their
1204   corresponding ASCII digit and all other Latin-1 characters except
1205   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1206   are treated as errors. This includes embedded NUL characters.
1207
1208   Error handling is defined by the errors argument:
1209
1210      NULL or "strict": raise a ValueError
1211      "ignore": ignore the wrong characters (these are not copied to the
1212                output buffer)
1213      "replace": replaces illegal characters with '?'
1214
1215   Returns 0 on success, -1 on failure.
1216
1217*/
1218
1219#ifndef Py_LIMITED_API
1220PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1221    Py_UNICODE *s,              /* Unicode buffer */
1222    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1223    char *output,               /* Output buffer; must have size >= length+1 */
1224    const char *errors          /* error handling */
1225    );
1226#endif
1227
1228/* --- File system encoding ---------------------------------------------- */
1229
1230/* ParseTuple converter: encode str objects to bytes using
1231   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1232
1233PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1234
1235/* ParseTuple converter: decode bytes objects to unicode using
1236   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1237
1238PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1239
1240/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1241   and the "surrogateescape" error handler.
1242
1243   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1244   encoding.
1245
1246   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1247*/
1248
1249PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1250    const char *s               /* encoded string */
1251    );
1252
1253/* Decode a string using Py_FileSystemDefaultEncoding
1254   and the "surrogateescape" error handler.
1255
1256   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1257   encoding.
1258*/
1259
1260PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1261    const char *s,               /* encoded string */
1262    Py_ssize_t size              /* size */
1263    );
1264
1265/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1266   "surrogateescape" error handler, and return bytes.
1267
1268   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1269   encoding.
1270*/
1271
1272PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1273    PyObject *unicode
1274    );
1275
1276/* --- Methods & Slots ----------------------------------------------------
1277
1278   These are capable of handling Unicode objects and strings on input
1279   (we refer to them as strings in the descriptions) and return
1280   Unicode objects or integers as appropriate. */
1281
1282/* Concat two strings giving a new Unicode string. */
1283
1284PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1285    PyObject *left,             /* Left string */
1286    PyObject *right             /* Right string */
1287    );
1288
1289/* Concat two strings and put the result in *pleft
1290   (sets *pleft to NULL on error) */
1291
1292PyAPI_FUNC(void) PyUnicode_Append(
1293    PyObject **pleft,           /* Pointer to left string */
1294    PyObject *right             /* Right string */
1295    );
1296
1297/* Concat two strings, put the result in *pleft and drop the right object
1298   (sets *pleft to NULL on error) */
1299
1300PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1301    PyObject **pleft,           /* Pointer to left string */
1302    PyObject *right             /* Right string */
1303    );
1304
1305/* Split a string giving a list of Unicode strings.
1306
1307   If sep is NULL, splitting will be done at all whitespace
1308   substrings. Otherwise, splits occur at the given separator.
1309
1310   At most maxsplit splits will be done. If negative, no limit is set.
1311
1312   Separators are not included in the resulting list.
1313
1314*/
1315
1316PyAPI_FUNC(PyObject*) PyUnicode_Split(
1317    PyObject *s,                /* String to split */
1318    PyObject *sep,              /* String separator */
1319    Py_ssize_t maxsplit         /* Maxsplit count */
1320    );
1321
1322/* Split a string at line breaks, giving a list of Unicode strings.
1323
1324   CRLF is considered to be one line break. Line breaks are not
1325   included in the resulting list unless keepends is true. */
1326
1327PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1328    PyObject *s,                /* String to split */
1329    int keepends                /* If true, line end markers are included */
1330    );
1331
1332/* Partition a string using a given separator. */
1333
1334PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1335    PyObject *s,                /* String to partition */
1336    PyObject *sep               /* String separator */
1337    );
1338
1339/* Partition a string using a given separator, searching from the end of the
1340   string. */
1341
1342PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1343    PyObject *s,                /* String to partition */
1344    PyObject *sep               /* String separator */
1345    );
1346
1347/* Split a string from the end, giving a list of Unicode strings.
1348
1349   If sep is NULL, splitting will be done at all whitespace
1350   substrings. Otherwise, splits occur at the given separator.
1351
1352   At most maxsplit splits will be done; if maxsplit is negative, no
1353   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1354   the end of the string.
1355
1356   Separators are not included in the resulting list.
1357
1358*/
1359
1360PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1361    PyObject *s,                /* String to split */
1362    PyObject *sep,              /* String separator */
1363    Py_ssize_t maxsplit         /* Maxsplit count */
1364    );
1365
1366/* Translate a string by applying a character mapping table to it and
1367   return the resulting Unicode object.
1368
1369   The mapping table must map Unicode ordinal integers to Unicode
1370   ordinal integers or None (causing deletion of the character).
1371
1372   Mapping tables may be dictionaries or sequences. Unmapped character
1373   ordinals (ones which cause a LookupError) are left untouched and
1374   are copied as-is.
1375
1376*/
1377
1378PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1379    PyObject *str,              /* String */
1380    PyObject *table,            /* Translate table */
1381    const char *errors          /* error handling */
1382    );
1383
1384/* Join a sequence of strings using the given separator and return
1385   the resulting Unicode string. */
1386
1387PyAPI_FUNC(PyObject*) PyUnicode_Join(
1388    PyObject *separator,        /* Separator string */
1389    PyObject *seq               /* Sequence object */
1390    );
1391
1392/* Return 1 if substr matches str[start:end] at the given tail end, 0
1393   otherwise. */
1394
1395PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1396    PyObject *str,              /* String */
1397    PyObject *substr,           /* Prefix or Suffix string */
1398    Py_ssize_t start,           /* Start index */
1399    Py_ssize_t end,             /* Stop index */
1400    int direction               /* Tail end: -1 prefix, +1 suffix */
1401    );
1402
1403/* Return the first position of substr in str[start:end] using the
1404   given search direction or -1 if not found. -2 is returned in case
1405   an error occurred and an exception is set. */
1406
1407PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1408    PyObject *str,              /* String */
1409    PyObject *substr,           /* Substring to find */
1410    Py_ssize_t start,           /* Start index */
1411    Py_ssize_t end,             /* Stop index */
1412    int direction               /* Find direction: +1 forward, -1 backward */
1413    );
1414
1415/* Count the number of occurrences of substr in str[start:end]. */
1416
1417PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1418    PyObject *str,              /* String */
1419    PyObject *substr,           /* Substring to count */
1420    Py_ssize_t start,           /* Start index */
1421    Py_ssize_t end              /* Stop index */
1422    );
1423
1424/* Replace at most maxcount occurrences of substr in str with replstr
1425   and return the resulting Unicode object. */
1426
1427PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1428    PyObject *str,              /* String */
1429    PyObject *substr,           /* Substring to find */
1430    PyObject *replstr,          /* Substring to replace */
1431    Py_ssize_t maxcount         /* Max. number of replacements to apply;
1432                                   -1 = all */
1433    );
1434
1435/* Compare two strings and return -1, 0, 1 for less than, equal,
1436   greater than resp. */
1437
1438PyAPI_FUNC(int) PyUnicode_Compare(
1439    PyObject *left,             /* Left string */
1440    PyObject *right             /* Right string */
1441    );
1442
1443PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1444    PyObject *left,
1445    const char *right
1446    );
1447
1448/* Rich compare two strings and return one of the following:
1449
1450   - NULL in case an exception was raised
1451   - Py_True or Py_False for successful comparisons
1452   - Py_NotImplemented in case the type combination is unknown
1453
1454   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1455   case the conversion of the arguments to Unicode fails with a
1456   UnicodeDecodeError.
1457
1458   Possible values for op:
1459
1460     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1461
1462*/
1463
1464PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1465    PyObject *left,             /* Left string */
1466    PyObject *right,            /* Right string */
1467    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1468    );
1469
1470/* Apply an argument tuple or dictionary to a format string and return
1471   the resulting Unicode string. */
1472
1473PyAPI_FUNC(PyObject *) PyUnicode_Format(
1474    PyObject *format,           /* Format string */
1475    PyObject *args              /* Argument tuple or dictionary */
1476    );
1477
1478/* Checks whether element is contained in container and returns 1/0
1479   accordingly.
1480
1481   element has to coerce to a one-element Unicode string. -1 is
1482   returned in case of an error. */
1483
1484PyAPI_FUNC(int) PyUnicode_Contains(
1485    PyObject *container,        /* Container string */
1486    PyObject *element           /* Element string */
1487    );
1488
1489/* Checks whether argument is a valid identifier. */
1490
1491PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1492
1493#ifndef Py_LIMITED_API
1494/* Externally visible for str.strip(unicode) */
1495PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1496    PyUnicodeObject *self,
1497    int striptype,
1498    PyObject *sepobj
1499    );
1500#endif
1501
1502/* Using the current locale, insert the thousands grouping
1503   into the string pointed to by buffer.  For the argument descriptions,
1504   see Objects/stringlib/localeutil.h */
1505
1506#ifndef Py_LIMITED_API
1507PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1508                                                   Py_ssize_t n_buffer,
1509                                                   Py_UNICODE *digits,
1510                                                   Py_ssize_t n_digits,
1511                                                   Py_ssize_t min_width);
1512#endif
1513
1514/* Using explicit passed-in values, insert the thousands grouping
1515   into the string pointed to by buffer.  For the argument descriptions,
1516   see Objects/stringlib/localeutil.h */
1517#ifndef Py_LIMITED_API
1518PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer,
1519                                                   Py_ssize_t n_buffer,
1520                                                   Py_UNICODE *digits,
1521                                                   Py_ssize_t n_digits,
1522                                                   Py_ssize_t min_width,
1523                                                   const char *grouping,
1524                                                   const char *thousands_sep);
1525#endif
1526/* === Characters Type APIs =============================================== */
1527
1528/* Helper array used by Py_UNICODE_ISSPACE(). */
1529
1530#ifndef Py_LIMITED_API
1531PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1532
1533/* These should not be used directly. Use the Py_UNICODE_IS* and
1534   Py_UNICODE_TO* macros instead.
1535
1536   These APIs are implemented in Objects/unicodectype.c.
1537
1538*/
1539
1540PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1541    Py_UCS4 ch       /* Unicode character */
1542    );
1543
1544PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1545    Py_UCS4 ch       /* Unicode character */
1546    );
1547
1548PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1549    Py_UCS4 ch       /* Unicode character */
1550    );
1551
1552PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1553    Py_UCS4 ch       /* Unicode character */
1554    );
1555
1556PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1557    Py_UCS4 ch       /* Unicode character */
1558    );
1559
1560PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1561    const Py_UCS4 ch         /* Unicode character */
1562    );
1563
1564PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1565    const Py_UCS4 ch         /* Unicode character */
1566    );
1567
1568PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1569    Py_UCS4 ch       /* Unicode character */
1570    );
1571
1572PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1573    Py_UCS4 ch       /* Unicode character */
1574    );
1575
1576PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1577    Py_UCS4 ch       /* Unicode character */
1578    );
1579
1580PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1581    Py_UCS4 ch       /* Unicode character */
1582    );
1583
1584PyAPI_FUNC(int) _PyUnicode_ToDigit(
1585    Py_UCS4 ch       /* Unicode character */
1586    );
1587
1588PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1589    Py_UCS4 ch       /* Unicode character */
1590    );
1591
1592PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1593    Py_UCS4 ch       /* Unicode character */
1594    );
1595
1596PyAPI_FUNC(int) _PyUnicode_IsDigit(
1597    Py_UCS4 ch       /* Unicode character */
1598    );
1599
1600PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1601    Py_UCS4 ch       /* Unicode character */
1602    );
1603
1604PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1605    Py_UCS4 ch       /* Unicode character */
1606    );
1607
1608PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1609    Py_UCS4 ch       /* Unicode character */
1610    );
1611
1612PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1613    const Py_UNICODE *u
1614    );
1615
1616PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1617    Py_UNICODE *s1,
1618    const Py_UNICODE *s2);
1619
1620PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1621    Py_UNICODE *s1, const Py_UNICODE *s2);
1622
1623PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1624    Py_UNICODE *s1,
1625    const Py_UNICODE *s2,
1626    size_t n);
1627
1628PyAPI_FUNC(int) Py_UNICODE_strcmp(
1629    const Py_UNICODE *s1,
1630    const Py_UNICODE *s2
1631    );
1632
1633PyAPI_FUNC(int) Py_UNICODE_strncmp(
1634    const Py_UNICODE *s1,
1635    const Py_UNICODE *s2,
1636    size_t n
1637    );
1638
1639PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1640    const Py_UNICODE *s,
1641    Py_UNICODE c
1642    );
1643
1644PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
1645    const Py_UNICODE *s,
1646    Py_UNICODE c
1647    );
1648
1649/* Create a copy of a unicode string ending with a nul character. Return NULL
1650   and raise a MemoryError exception on memory allocation failure, otherwise
1651   return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1652
1653PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
1654    PyObject *unicode
1655    );
1656#endif /* Py_LIMITED_API */
1657
1658#ifdef __cplusplus
1659}
1660#endif
1661#endif /* !Py_UNICODEOBJECT_H */
1662