unicodeobject.h revision 190d79e5c648174b550de2bef75d1b4addf0d625
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal (see file Misc/unicode.txt).
11
12Copyright (c) Corporation for National Research Initiatives.
13
14
15 Original header:
16 --------------------------------------------------------------------
17
18 * Yet another Unicode string type for Python.  This type supports the
19 * 16-bit Basic Multilingual Plane (BMP) only.
20 *
21 * Written by Fredrik Lundh, January 1999.
22 *
23 * Copyright (c) 1999 by Secret Labs AB.
24 * Copyright (c) 1999 by Fredrik Lundh.
25 *
26 * fredrik@pythonware.com
27 * http://www.pythonware.com
28 *
29 * --------------------------------------------------------------------
30 * This Unicode String Type is
31 *
32 * Copyright (c) 1999 by Secret Labs AB
33 * Copyright (c) 1999 by Fredrik Lundh
34 *
35 * By obtaining, using, and/or copying this software and/or its
36 * associated documentation, you agree that you have read, understood,
37 * and will comply with the following terms and conditions:
38 *
39 * Permission to use, copy, modify, and distribute this software and its
40 * associated documentation for any purpose and without fee is hereby
41 * granted, provided that the above copyright notice appears in all
42 * copies, and that both that copyright notice and this permission notice
43 * appear in supporting documentation, and that the name of Secret Labs
44 * AB or the author not be used in advertising or publicity pertaining to
45 * distribution of the software without specific, written prior
46 * permission.
47 *
48 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
50 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
55 * -------------------------------------------------------------------- */
56
57#include <ctype.h>
58
59/* === Internal API ======================================================= */
60
61/* --- Internal Unicode Format -------------------------------------------- */
62
63/* Python 3.x requires unicode */
64#define Py_USING_UNICODE
65
66/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
67   properly set, but the default rules below don't set it.  I'll
68   sort this out some other day -- fredrik@pythonware.com */
69
70#ifndef Py_UNICODE_SIZE
71#error Must define Py_UNICODE_SIZE
72#endif
73
74/* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
75   strings are stored as UCS-2 (with limited support for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h", "wctype.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
86/* Defaults for various platforms */
87#ifndef PY_UNICODE_TYPE
88
89/* Windows has a usable wchar_t type (unless we're using UCS-4) */
90# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
91#  define HAVE_USABLE_WCHAR_T
92#  define PY_UNICODE_TYPE wchar_t
93# endif
94
95# if defined(Py_UNICODE_WIDE)
96#  define PY_UNICODE_TYPE Py_UCS4
97# endif
98
99#endif
100
101/* If the compiler provides a wchar_t type we try to support it
102   through the interface functions PyUnicode_FromWideChar() and
103   PyUnicode_AsWideChar(). */
104
105#ifdef HAVE_USABLE_WCHAR_T
106# ifndef HAVE_WCHAR_H
107#  define HAVE_WCHAR_H
108# endif
109#endif
110
111#ifdef HAVE_WCHAR_H
112/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
113# ifdef _HAVE_BSDI
114#  include <time.h>
115# endif
116#  include <wchar.h>
117#endif
118
119/*
120 * Use this typedef when you need to represent a UTF-16 surrogate pair
121 * as single unsigned integer.
122 */
123#if SIZEOF_INT >= 4
124typedef unsigned int Py_UCS4;
125#elif SIZEOF_LONG >= 4
126typedef unsigned long Py_UCS4;
127#endif
128
129typedef PY_UNICODE_TYPE Py_UNICODE;
130
131/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
132
133/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds
134   produce different external names and thus cause import errors in
135   case Python interpreters and extensions with mixed compiled in
136   Unicode width assumptions are combined. */
137
138#ifndef Py_UNICODE_WIDE
139
140# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
141# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
142# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
143# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
144# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
145# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
146# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
147# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
148# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
149# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
150# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
151# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
152# define PyUnicode_Compare PyUnicodeUCS2_Compare
153# define PyUnicode_Concat PyUnicodeUCS2_Concat
154# define PyUnicode_Append PyUnicodeUCS2_Append
155# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
156# define PyUnicode_Contains PyUnicodeUCS2_Contains
157# define PyUnicode_Count PyUnicodeUCS2_Count
158# define PyUnicode_Decode PyUnicodeUCS2_Decode
159# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
160# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
161# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
162# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
163# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize
164# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
165# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
166# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
167# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
168# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
169# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
170# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
171# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
172# define PyUnicode_Encode PyUnicodeUCS2_Encode
173# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
174# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
175# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
176# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
177# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
178# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
179# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
180# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
181# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
182# define PyUnicode_Find PyUnicodeUCS2_Find
183# define PyUnicode_Format PyUnicodeUCS2_Format
184# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
185# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
186# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
187# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
188# define PyUnicode_FromString PyUnicodeUCS2_FromString
189# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
190# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
191# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
192# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
193# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
194# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
195# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
196# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
197# define PyUnicode_Join PyUnicodeUCS2_Join
198# define PyUnicode_Partition PyUnicodeUCS2_Partition
199# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
200# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
201# define PyUnicode_Replace PyUnicodeUCS2_Replace
202# define PyUnicode_Resize PyUnicodeUCS2_Resize
203# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
204# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
205# define PyUnicode_Split PyUnicodeUCS2_Split
206# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
207# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
208# define PyUnicode_Translate PyUnicodeUCS2_Translate
209# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
210# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
211# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
212# define _PyUnicode_Init _PyUnicodeUCS2_Init
213# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
214# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
215# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
216# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
217# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
218# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
219# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
220# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
221# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
222# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
223# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
224# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
225# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
226# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
227# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
228# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
229# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
230
231#else
232
233# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
234# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
235# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
236# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
237# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
238# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
239# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
240# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
241# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
242# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
243# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
244# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
245# define PyUnicode_Compare PyUnicodeUCS4_Compare
246# define PyUnicode_Concat PyUnicodeUCS4_Concat
247# define PyUnicode_Append PyUnicodeUCS4_Append
248# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
249# define PyUnicode_Contains PyUnicodeUCS4_Contains
250# define PyUnicode_Count PyUnicodeUCS4_Count
251# define PyUnicode_Decode PyUnicodeUCS4_Decode
252# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
253# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
254# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
255# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
256# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize
257# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
258# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
259# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
260# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
261# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
262# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
263# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
264# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
265# define PyUnicode_Encode PyUnicodeUCS4_Encode
266# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
267# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
268# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
269# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
270# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
271# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
272# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
273# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
274# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
275# define PyUnicode_Find PyUnicodeUCS4_Find
276# define PyUnicode_Format PyUnicodeUCS4_Format
277# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
278# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
279# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
280# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
281# define PyUnicode_FromString PyUnicodeUCS4_FromString
282# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
283# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
284# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
285# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
286# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
287# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
288# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
289# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
290# define PyUnicode_Join PyUnicodeUCS4_Join
291# define PyUnicode_Partition PyUnicodeUCS4_Partition
292# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
293# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
294# define PyUnicode_Replace PyUnicodeUCS4_Replace
295# define PyUnicode_Resize PyUnicodeUCS4_Resize
296# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
297# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
298# define PyUnicode_Split PyUnicodeUCS4_Split
299# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
300# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
301# define PyUnicode_Translate PyUnicodeUCS4_Translate
302# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
303# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
304# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
305# define _PyUnicode_Init _PyUnicodeUCS4_Init
306# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
307# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
308# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
309# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
310# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
311# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
312# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
313# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
314# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
315# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
316# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
317# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
318# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
319# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
320# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
321# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
322# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
323
324
325#endif
326
327/* --- Internal Unicode Operations ---------------------------------------- */
328
329/* If you want Python to use the compiler's wctype.h functions instead
330   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
331   configure Python using --with-wctype-functions.  This reduces the
332   interpreter's code size. */
333
334#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
335
336#include <wctype.h>
337
338#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
339
340#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
341#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
342#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
343#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
344
345#define Py_UNICODE_TOLOWER(ch) towlower(ch)
346#define Py_UNICODE_TOUPPER(ch) towupper(ch)
347#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
348
349#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
350#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
351#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
352
353#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
354#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
355#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
356
357#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
358
359#else
360
361/* Since splitting on whitespace is an important use case, and whitespace
362   in most situations is solely ASCII whitespace, we optimize for the common
363   case by using a quick look-up table with an inlined check.
364 */
365extern const unsigned char _Py_ascii_whitespace[];
366
367#define Py_UNICODE_ISSPACE(ch) \
368	((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
369
370#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
371#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
372#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
373#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
374
375#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
376#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
377#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
378
379#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
380#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
381#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
382
383#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
384#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
385#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
386
387#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
388
389#endif
390
391#define Py_UNICODE_ISALNUM(ch) \
392       (Py_UNICODE_ISALPHA(ch) || \
393        Py_UNICODE_ISDECIMAL(ch) || \
394        Py_UNICODE_ISDIGIT(ch) || \
395        Py_UNICODE_ISNUMERIC(ch))
396
397#define Py_UNICODE_COPY(target, source, length)				\
398	Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
399
/* Store `value` into the first `length` Py_UNICODE slots of `target`.

   target: pointer to a writable Py_UNICODE buffer with room for at
           least `length` elements.
   value:  the Py_UNICODE value written to every slot.
   length: number of slots to fill; zero or a negative count stores
           nothing.

   Each argument is evaluated exactly once, in the order target, value,
   length, so an argument with side effects (for example a function call
   used as the length) is safe to pass.  The expansion is a single
   statement and must be followed by a semicolon. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        Py_UNICODE *t_ = (target); \
        const Py_UNICODE v_ = (value); \
        /* Capture the count once instead of re-reading it per iteration. */ \
        const Py_ssize_t n_ = (length); \
        Py_ssize_t i_; \
        for (i_ = 0; i_ < n_; i_++) \
            t_[i_] = v_; \
    } while (0)
404
405/* check if substring matches at given offset.  the offset must be
406   valid, and the substring must not be empty */
407#define Py_UNICODE_MATCH(string, offset, substring) \
408    ((*((string)->str + (offset)) == *((substring)->str)) && \
409    ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
410     !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
411
412#ifdef __cplusplus
413extern "C" {
414#endif
415
416/* --- Unicode Type ------------------------------------------------------- */
417
418typedef struct {
419    PyObject_HEAD
420    Py_ssize_t length;		/* Number of Py_UNICODE units in str (not bytes) */
421    Py_UNICODE *str;		/* Raw Unicode buffer */
422    long hash;			/* Hash value; -1 if not set */
423    int state;			/* != 0 if interned (presumably an SSTATE_* value).
424    				 * In that case the two references from the dictionary
425    				 * to this object are *not* counted in ob_refcnt. */
426    PyObject *defenc;		/* (Default) Encoded version as Python
427				   string, or NULL; this is used for
428				   implementing the buffer protocol */
429} PyUnicodeObject;
430
431PyAPI_DATA(PyTypeObject) PyUnicode_Type;
432PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
433
434#define SSTATE_NOT_INTERNED 0
435#define SSTATE_INTERNED_MORTAL 1
436#define SSTATE_INTERNED_IMMORTAL 2
437
438#define PyUnicode_Check(op) \
439                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
440#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
441
442/* Fast access macros */
443#define PyUnicode_GET_SIZE(op) \
444        (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length))
445#define PyUnicode_GET_DATA_SIZE(op) \
446        (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))
447#define PyUnicode_AS_UNICODE(op) \
448        (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str))
449#define PyUnicode_AS_DATA(op) \
450        (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str))
451
452/* --- Constants ---------------------------------------------------------- */
453
454/* This Unicode character will be used as replacement character during
455   decoding if the errors argument is set to "replace". Note: the
456   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
457   Unicode 3.0. */
458
459#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
460
461/* === Public API ========================================================= */
462
463/* --- Plain Py_UNICODE --------------------------------------------------- */
464
465/* Create a Unicode Object from the Py_UNICODE buffer u of the given
466   size.
467
468   u may be NULL which causes the contents to be undefined. It is the
469   user's responsibility to fill in the needed data afterwards. Note
470   that modifying the Unicode object contents after construction is
471   only allowed if u was set to NULL.
472
473   The buffer is copied into the new object. */
474
475PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
476    const Py_UNICODE *u,        /* Unicode buffer */
477    Py_ssize_t size             /* size of buffer */
478    );
479
480/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
481PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
482    const char *u,        /* char buffer */
483    Py_ssize_t size       /* size of buffer */
484    );
485
486/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
487   Latin-1 encoded bytes */
488PyAPI_FUNC(PyObject*) PyUnicode_FromString(
489    const char *u        /* string */
490    );
491
492/* Return a read-only pointer to the Unicode object's internal
493   Py_UNICODE buffer. */
494
495PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
496    PyObject *unicode	 	/* Unicode object */
497    );
498
499/* Get the length of the Unicode object. */
500
501PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
502    PyObject *unicode	 	/* Unicode object */
503    );
504
505/* Get the maximum ordinal for a Unicode character. */
506PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
507
508/* Resize an already allocated Unicode object to the new size length.
509
510   *unicode is modified to point to the new (resized) object and 0
511   returned on success.
512
513   This API may only be called by the function which also called the
514   Unicode constructor. The refcount on the object must be 1. Otherwise,
515   an error is returned.
516
517   Error handling is implemented as follows: an exception is set, -1
518   is returned and *unicode left untouched.
519
520*/
521
522PyAPI_FUNC(int) PyUnicode_Resize(
523    PyObject **unicode,		/* Pointer to the Unicode object */
524    Py_ssize_t length		/* New length */
525    );
526
527/* Coerce obj to a Unicode object and return a reference with
528   *incremented* refcount.
529
530   Coercion is done in the following way:
531
532   1. String and other char buffer compatible objects are decoded
533      under the assumptions that they contain data using the current
534      default encoding. Decoding is done in "strict" mode.
535
536   2. All other objects (including Unicode objects) raise an
537      exception.
538
539   The API returns NULL in case of an error. The caller is responsible
540   for decref'ing the returned objects.
541
542*/
543
544PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
545    register PyObject *obj, 	/* Object */
546    const char *encoding,       /* encoding */
547    const char *errors          /* error handling */
548    );
549
550/* Coerce obj to a Unicode object and return a reference with
551   *incremented* refcount.
552
553   Unicode objects are passed back as-is (subclasses are converted to
554   true Unicode objects), all other objects are delegated to
555   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
556   using the default encoding as basis for decoding the object.
557
558   The API returns NULL in case of an error. The caller is responsible
559   for decref'ing the returned objects.
560
561*/
562
563PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
564    register PyObject *obj 	/* Object */
565    );
566
567PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
568PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
569
570PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
571PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
572PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
573PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
574
575/* Use only if you know it's a string */
576#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
577
578/* --- wchar_t support for platforms which support it --------------------- */
579
580#ifdef HAVE_WCHAR_H
581
582/* Create a Unicode Object from the wchar_t buffer w of the given
583   size.
584
585   The buffer is copied into the new object. */
586
587PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
588    register const wchar_t *w,  /* wchar_t buffer */
589    Py_ssize_t size             /* size of buffer */
590    );
591
592/* Copies the Unicode Object contents into the wchar_t buffer w.  At
593   most size wchar_t characters are copied.
594
595   Note that the resulting wchar_t string may or may not be
596   0-terminated.  It is the responsibility of the caller to make sure
597   that the wchar_t string is 0-terminated in case this is required by
598   the application.
599
600   Returns the number of wchar_t characters copied (excluding a
601   possibly trailing 0-termination character) or -1 in case of an
602   error. */
603
604PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
605    PyUnicodeObject *unicode,   /* Unicode object */
606    register wchar_t *w,        /* wchar_t buffer */
607    Py_ssize_t size             /* size of buffer */
608    );
609
610#endif
611
612/* --- Unicode ordinals --------------------------------------------------- */
613
614/* Create a Unicode Object from the given Unicode code point ordinal.
615
616   The ordinal must be in range(0x10000) on narrow Python builds
617   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
618   raised in case it is not.
619
620*/
621
622PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
623
624/* === Builtin Codecs =====================================================
625
626   Many of these APIs take two arguments encoding and errors. These
627   parameters encoding and errors have the same semantics as the ones
628   of the builtin unicode() API.
629
630   Setting encoding to NULL causes the default encoding to be used.
631
632   Error handling is set by errors which may also be set to NULL
633   meaning to use the default handling defined for the codec. Default
634   error handling for all builtin codecs is "strict" (ValueErrors are
635   raised).
636
637   The codecs all use a similar interface. Only deviations from the
638   generic ones are documented.
639
640*/
641
642/* --- Manage the default encoding ---------------------------------------- */
643
644/* Return a Python string holding the default encoded value of the
645   Unicode object.
646
647   The resulting string is cached in the Unicode object for subsequent
648   usage by this function. The cached version is needed to implement
649   the character buffer interface and will live (at least) as long as
650   the Unicode object itself.
651
652   The refcount of the string is *not* incremented.
653
654   *** Exported for internal use by the interpreter only !!! ***
655
656*/
657
658PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
659    PyObject *, const char *);
660
661/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
662
663   If the encoding is supported by one of the built-in codecs (i.e., UTF-8,
664   UTF-16, UTF-32, Latin-1 or MBCS), that codec is used; otherwise fall back
665   to UTF-8 and replace invalid characters with '?'.
666
667   The function is intended to be used for paths and file names only
668   during bootstrapping process where the codecs are not set up.
669*/
670
671PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
672    const char *s               /* encoded string */
673    );
674
675PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
676    const char *s,               /* encoded string */
677    Py_ssize_t size              /* size */
678    );
679
680
681/* Returns a char* holding the UTF-8 encoded value of the Unicode
682   object, and its size.
683
684   If the output argument is NULL, no size is stored.
685*/
686
687PyAPI_FUNC(char *) PyUnicode_AsStringAndSize(PyObject*, Py_ssize_t *);
688
689/* Returns the UTF-8 encoding; equivalent to
690   PyUnicode_AsStringAndSize(x, NULL).  NOTE(review): the remark
691   "DEPRECATED: use PyUnicode_AsStringAndSize() instead" found in this
692   section presumably applies to this function -- confirm. */
693
694PyAPI_FUNC(char *) PyUnicode_AsString(PyObject*);
695
696/* Returns the currently active default encoding (presumably its name;
697   this declaration had no description of its own -- confirm).
698
699   See PyUnicode_SetDefaultEncoding() for changing it.
700 */
701
702PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
703
704/* Sets the currently active default encoding.
705
706   Returns 0 on success, -1 in case of an error.
707
708 */
709
710PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
711    const char *encoding	/* Encoding name in standard form */
712    );
713
714/* --- Generic Codecs ----------------------------------------------------- */
715
716/* Create a Unicode object by decoding the encoded string s of the
717   given size. */
718
719PyAPI_FUNC(PyObject*) PyUnicode_Decode(
720    const char *s,              /* encoded string */
721    Py_ssize_t size,            /* size of buffer */
722    const char *encoding,       /* encoding */
723    const char *errors          /* error handling */
724    );
725
726/* Encodes a Py_UNICODE buffer of the given size and returns a
727   Python string object. */
728
729PyAPI_FUNC(PyObject*) PyUnicode_Encode(
730    const Py_UNICODE *s,        /* Unicode char buffer */
731    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
732    const char *encoding,       /* encoding */
733    const char *errors          /* error handling */
734    );
735
736/* Encodes a Unicode object and returns the result as Python
737   object. */
738
739PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
740    PyObject *unicode,	 	/* Unicode object */
741    const char *encoding,	/* encoding */
742    const char *errors		/* error handling */
743    );
744
745/* Encodes a Unicode object and returns the result as Python string
746   object. */
747
748PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
749    PyObject *unicode,	 	/* Unicode object */
750    const char *encoding,	/* encoding */
751    const char *errors		/* error handling */
752    );
753
754PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
755    PyObject* string            /* 256 character map */
756   );
757
758
759/* --- UTF-7 Codecs ------------------------------------------------------- */
760
761PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
762    const char *string, 	/* UTF-7 encoded string */
763    Py_ssize_t length,	 	/* size of string */
764    const char *errors		/* error handling */
765    );
766
767PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
768    const char *string, 	/* UTF-7 encoded string */
769    Py_ssize_t length,	 	/* size of string */
770    const char *errors,		/* error handling */
771    Py_ssize_t *consumed	/* bytes consumed */
772    );
773
774PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
775    const Py_UNICODE *data, 	/* Unicode char buffer */
776    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
777    int encodeSetO,             /* force the encoder to encode characters in
778                                   Set O, as described in RFC2152 */
779    int encodeWhiteSpace,       /* force the encoder to encode space, tab,
780                                   carriage return and linefeed characters */
781    const char *errors		/* error handling */
782    );
783
784/* --- UTF-8 Codecs ------------------------------------------------------- */
785
786PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
787    const char *string, 	/* UTF-8 encoded string */
788    Py_ssize_t length,	 	/* size of string */
789    const char *errors		/* error handling */
790    );
791
792PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
793    const char *string, 	/* UTF-8 encoded string */
794    Py_ssize_t length,	 	/* size of string */
795    const char *errors,		/* error handling */
796    Py_ssize_t *consumed	/* bytes consumed */
797    );
798
799PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
800    PyObject *unicode	 	/* Unicode object */
801    );
802
803PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
804    const Py_UNICODE *data, 	/* Unicode char buffer */
805    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
806    const char *errors		/* error handling */
807    );
808
809/* --- UTF-32 Codecs ------------------------------------------------------ */
810
811/* Decodes length bytes from a UTF-32 encoded buffer string and returns
812   the corresponding Unicode object.
813
814   errors (if non-NULL) defines the error handling. It defaults
815   to "strict".
816
817   If byteorder is non-NULL, the decoder starts decoding using the
818   given byte order:
819
820	*byteorder == -1: little endian
821	*byteorder == 0:  native order
822	*byteorder == 1:  big endian
823
824   In native mode, the first four bytes of the stream are checked for a
825   BOM mark. If found, the BOM mark is analysed, the byte order
826   adjusted and the BOM skipped.  In the other modes, no BOM mark
827   interpretation is done. After completion, *byteorder is set to the
828   current byte order at the end of input data.
829
830   If byteorder is NULL, the codec starts in native order mode.
831
832*/
833
834PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
835    const char *string, 	/* UTF-32 encoded string */
836    Py_ssize_t length,	 	/* size of string */
837    const char *errors,		/* error handling */
838    int *byteorder		/* pointer to byteorder to use
839				   0=native;-1=LE,1=BE; updated on
840				   exit */
841    );
842
843PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
844    const char *string, 	/* UTF-32 encoded string */
845    Py_ssize_t length,	 	/* size of string */
846    const char *errors,		/* error handling */
847    int *byteorder,		/* pointer to byteorder to use
848				   0=native;-1=LE,1=BE; updated on
849				   exit */
850    Py_ssize_t *consumed	/* bytes consumed */
851    );
852
853/* Returns a Python string using the UTF-32 encoding in native byte
854   order. The string always starts with a BOM mark.  */
855
856PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
857    PyObject *unicode	 	/* Unicode object */
858    );
859
860/* Returns a Python string object holding the UTF-32 encoded value of
861   the Unicode data.
862
863   Output is written according to the following byte order, selected
864   by byteorder:
865
866   byteorder == -1: little endian
867   byteorder == 0:  native byte order (writes a BOM mark)
868   byteorder == 1:  big endian
869
870   If byteorder is 0, the output string will always start with the
871   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
872   prepended.
873
874*/
875
876PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
877    const Py_UNICODE *data, 	/* Unicode char buffer */
878    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
879    const char *errors,		/* error handling */
880    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
881    );
882
883/* --- UTF-16 Codecs ------------------------------------------------------ */
884
885/* Decodes length bytes from a UTF-16 encoded buffer string and returns
886   the corresponding Unicode object.
887
888   errors (if non-NULL) defines the error handling. It defaults
889   to "strict".
890
891   If byteorder is non-NULL, the decoder starts decoding using the
892   given byte order:
893
894	*byteorder == -1: little endian
895	*byteorder == 0:  native order
896	*byteorder == 1:  big endian
897
898   In native mode, the first two bytes of the stream are checked for a
899   BOM mark. If found, the BOM mark is analysed, the byte order
900   adjusted and the BOM skipped.  In the other modes, no BOM mark
901   interpretation is done. After completion, *byteorder is set to the
902   current byte order at the end of input data.
903
904   If byteorder is NULL, the codec starts in native order mode.
905
906*/
907
908PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
909    const char *string, 	/* UTF-16 encoded string */
910    Py_ssize_t length,	 	/* size of string */
911    const char *errors,		/* error handling */
912    int *byteorder		/* pointer to byteorder to use
913				   0=native;-1=LE,1=BE; updated on
914				   exit */
915    );
916
917PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
918    const char *string, 	/* UTF-16 encoded string */
919    Py_ssize_t length,	 	/* size of string */
920    const char *errors,		/* error handling */
921    int *byteorder,		/* pointer to byteorder to use
922				   0=native;-1=LE,1=BE; updated on
923				   exit */
924    Py_ssize_t *consumed	/* bytes consumed */
925    );
926
927/* Returns a Python string using the UTF-16 encoding in native byte
928   order. The string always starts with a BOM mark.  */
929
930PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
931    PyObject *unicode	 	/* Unicode object */
932    );
933
934/* Returns a Python string object holding the UTF-16 encoded value of
935   the Unicode data.
936
937   Output is written according to the following byte order, selected
938   by byteorder:
939
940   byteorder == -1: little endian
941   byteorder == 0:  native byte order (writes a BOM mark)
942   byteorder == 1:  big endian
943
944   If byteorder is 0, the output string will always start with the
945   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
946   prepended.
947
948   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
949   UCS-2. This trick makes it possible to add full UTF-16 capabilities
950   at a later point without compromising the APIs.
951
952*/
953
954PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
955    const Py_UNICODE *data, 	/* Unicode char buffer */
956    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
957    const char *errors,		/* error handling */
958    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
959    );
960
961/* --- Unicode-Escape Codecs ---------------------------------------------- */
962
963PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
964    const char *string, 	/* Unicode-Escape encoded string */
965    Py_ssize_t length,	 	/* size of string */
966    const char *errors		/* error handling */
967    );
968
969PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
970    PyObject *unicode	 	/* Unicode object */
971    );
972
973PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
974    const Py_UNICODE *data, 	/* Unicode char buffer */
975    Py_ssize_t length	 	/* Number of Py_UNICODE chars to encode */
976    );
977
978/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
979
980PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
981    const char *string, 	/* Raw-Unicode-Escape encoded string */
982    Py_ssize_t length,	 	/* size of string */
983    const char *errors		/* error handling */
984    );
985
986PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
987    PyObject *unicode	 	/* Unicode object */
988    );
989
990PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
991    const Py_UNICODE *data, 	/* Unicode char buffer */
992    Py_ssize_t length	 	/* Number of Py_UNICODE chars to encode */
993    );
994
995/* --- Unicode Internal Codec ---------------------------------------------
996
997    Only for internal use in _codecsmodule.c */
998
999PyObject *_PyUnicode_DecodeUnicodeInternal(
1000    const char *string,
1001    Py_ssize_t length,
1002    const char *errors
1003    );
1004
1005/* --- Latin-1 Codecs -----------------------------------------------------
1006
1007   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1008
1009*/
1010
1011PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1012    const char *string, 	/* Latin-1 encoded string */
1013    Py_ssize_t length,	 	/* size of string */
1014    const char *errors		/* error handling */
1015    );
1016
1017PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1018    PyObject *unicode	 	/* Unicode object */
1019    );
1020
1021PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1022    const Py_UNICODE *data, 	/* Unicode char buffer */
1023    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to encode */
1024    const char *errors		/* error handling */
1025    );
1026
1027/* --- ASCII Codecs -------------------------------------------------------
1028
1029   Only 7-bit ASCII data is accepted. All other codes generate errors.
1030
1031*/
1032
1033PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1034    const char *string, 	/* ASCII encoded string */
1035    Py_ssize_t length,	 	/* size of string */
1036    const char *errors		/* error handling */
1037    );
1038
1039PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1040    PyObject *unicode	 	/* Unicode object */
1041    );
1042
1043PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1044    const Py_UNICODE *data, 	/* Unicode char buffer */
1045    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to encode */
1046    const char *errors		/* error handling */
1047    );
1048
1049/* --- Character Map Codecs -----------------------------------------------
1050
1051   This codec uses mappings to encode and decode characters.
1052
1053   Decoding mappings must map single string characters to single
1054   Unicode characters, integers (which are then interpreted as Unicode
1055   ordinals) or None (meaning "undefined mapping" and causing an
1056   error).
1057
1058   Encoding mappings must map single Unicode characters to single
1059   string characters, integers (which are then interpreted as Latin-1
1060   ordinals) or None (meaning "undefined mapping" and causing an
1061   error).
1062
1063   If a character lookup fails with a LookupError, the character is
1064   copied as-is meaning that its ordinal value will be interpreted as
1065   Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1066   to contain those mappings which map characters to different code
1067   points.
1068
1069*/
1070
1071PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1072    const char *string, 	/* Encoded string */
1073    Py_ssize_t length,	 	/* size of string */
1074    PyObject *mapping,		/* character mapping
1075				   (char ordinal -> unicode ordinal) */
1076    const char *errors		/* error handling */
1077    );
1078
1079PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1080    PyObject *unicode,	 	/* Unicode object */
1081    PyObject *mapping		/* character mapping
1082				   (unicode ordinal -> char ordinal) */
1083    );
1084
1085PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1086    const Py_UNICODE *data, 	/* Unicode char buffer */
1087    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to encode */
1088    PyObject *mapping,		/* character mapping
1089				   (unicode ordinal -> char ordinal) */
1090    const char *errors		/* error handling */
1091    );
1092
1093/* Translate a Py_UNICODE buffer of the given length by applying a
1094   character mapping table to it and return the resulting Unicode
1095   object.
1096
1097   The mapping table must map Unicode ordinal integers to Unicode
1098   ordinal integers or None (causing deletion of the character).
1099
1100   Mapping tables may be dictionaries or sequences. Unmapped character
1101   ordinals (ones which cause a LookupError) are left untouched and
1102   are copied as-is.
1103
1104*/
1105
1106PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1107    const Py_UNICODE *data, 	/* Unicode char buffer */
1108    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to translate */
1109    PyObject *table,		/* Translate table */
1110    const char *errors		/* error handling */
1111    );
1112
1113#ifdef MS_WIN32
1114
1115/* --- MBCS codecs for Windows -------------------------------------------- */
1116
1117PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1118    const char *string,         /* MBCS encoded string */
1119    Py_ssize_t length,              /* size of string */
1120    const char *errors          /* error handling */
1121    );
1122
1123PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1124    const char *string,         /* MBCS encoded string */
1125    Py_ssize_t length,          /* size of string */
1126    const char *errors,         /* error handling */
1127    Py_ssize_t *consumed        /* bytes consumed */
1128    );
1129
1130PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1131    PyObject *unicode           /* Unicode object */
1132    );
1133
1134PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1135    const Py_UNICODE *data,     /* Unicode char buffer */
1136    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1137    const char *errors          /* error handling */
1138    );
1139
1140#endif /* MS_WIN32 */
1141
1142/* --- Decimal Encoder ---------------------------------------------------- */
1143
1144/* Takes a Unicode string holding a decimal value and writes it into
1145   an output buffer using standard ASCII digit codes.
1146
1147   The output buffer has to provide at least length+1 bytes of storage
1148   area. The output string is 0-terminated.
1149
1150   The encoder converts whitespace to ' ', decimal characters to their
1151   corresponding ASCII digit and all other Latin-1 characters except
1152   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1153   are treated as errors. This includes embedded NUL characters.
1154
1155   Error handling is defined by the errors argument:
1156
1157      NULL or "strict": raise a ValueError
1158      "ignore": ignore the wrong characters (these are not copied to the
1159		output buffer)
1160      "replace": replaces illegal characters with '?'
1161
1162   Returns 0 on success, -1 on failure.
1163
1164*/
1165
1166PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1167    Py_UNICODE *s,		/* Unicode buffer */
1168    Py_ssize_t length,		/* Number of Py_UNICODE chars to encode */
1169    char *output,		/* Output buffer; must have size >= length+1
				   (room for the terminating 0) */
1170    const char *errors		/* error handling */
1171    );
1172
1173/* --- Methods & Slots ----------------------------------------------------
1174
1175   These are capable of handling Unicode objects and strings on input
1176   (we refer to them as strings in the descriptions) and return
1177   Unicode objects or integers as appropriate. */
1178
1179/* Concat two strings giving a new Unicode string. */
1180
1181PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1182    PyObject *left,	 	/* Left string */
1183    PyObject *right	 	/* Right string */
1184    );
1185
1186/* Concat two strings and put the result in *pleft
1187   (sets *pleft to NULL on error) */
1188
1189PyAPI_FUNC(void) PyUnicode_Append(
1190    PyObject **pleft,	 	/* Pointer to left string */
1191    PyObject *right	 	/* Right string */
1192    );
1193
1194/* Concat two strings, put the result in *pleft and drop the right object
1195   (sets *pleft to NULL on error) */
1196
1197PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1198    PyObject **pleft,	 	/* Pointer to left string */
1199    PyObject *right	 	/* Right string */
1200    );
1201
1202/* Split a string giving a list of Unicode strings.
1203
1204   If sep is NULL, splitting will be done at all whitespace
1205   substrings. Otherwise, splits occur at the given separator.
1206
1207   At most maxsplit splits will be done. If negative, no limit is set.
1208
1209   Separators are not included in the resulting list.
1210
1211*/
1212
1213PyAPI_FUNC(PyObject*) PyUnicode_Split(
1214    PyObject *s,		/* String to split */
1215    PyObject *sep,		/* String separator */
1216    Py_ssize_t maxsplit		/* Maxsplit count */
1217    );
1218
1219/* Split a string at line breaks, giving a list of Unicode strings.
1220
1221   CRLF is considered to be one line break. Line breaks are not
1222   included in the resulting list unless keepends is true. */
1223
1224PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1225    PyObject *s,		/* String to split */
1226    int keepends		/* If true, line end markers are included */
1227    );
1228
1229/* Partition a string using a given separator. */
1230
1231PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1232    PyObject *s,		/* String to partition */
1233    PyObject *sep		/* String separator */
1234    );
1235
1236/* Partition a string using a given separator, searching from the end of the
1237   string. */
1238
1239PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1240    PyObject *s,		/* String to partition */
1241    PyObject *sep		/* String separator */
1242    );
1243
1244/* Split a string giving a list of Unicode strings.
1245
1246   If sep is NULL, splitting will be done at all whitespace
1247   substrings. Otherwise, splits occur at the given separator.
1248
1249   At most maxsplit splits will be done; if maxsplit is negative, no
1250   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1251   the end of the string.
1252
1253   Separators are not included in the resulting list.
1254
1255*/
1256
1257PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1258    PyObject *s,		/* String to split */
1259    PyObject *sep,		/* String separator */
1260    Py_ssize_t maxsplit		/* Maxsplit count */
1261    );
1262
1263/* Translate a string by applying a character mapping table to it and
1264   return the resulting Unicode object.
1265
1266   The mapping table must map Unicode ordinal integers to Unicode
1267   ordinal integers or None (causing deletion of the character).
1268
1269   Mapping tables may be dictionaries or sequences. Unmapped character
1270   ordinals (ones which cause a LookupError) are left untouched and
1271   are copied as-is.
1272
1273*/
1274
1275PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1276    PyObject *str,		/* String */
1277    PyObject *table,		/* Translate table */
1278    const char *errors		/* error handling */
1279    );
1280
1281/* Join a sequence of strings using the given separator and return
1282   the resulting Unicode string. */
1283
1284PyAPI_FUNC(PyObject*) PyUnicode_Join(
1285    PyObject *separator, 	/* Separator string */
1286    PyObject *seq	 	/* Sequence object */
1287    );
1288
1289/* Return 1 if substr matches str[start:end] at the given tail end, 0
1290   otherwise. */
1291
1292PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1293    PyObject *str,		/* String */
1294    PyObject *substr,		/* Prefix or Suffix string */
1295    Py_ssize_t start,		/* Start index */
1296    Py_ssize_t end,		/* Stop index */
1297    int direction		/* Tail end: -1 prefix, +1 suffix */
1298    );
1299
1300/* Return the first position of substr in str[start:end] using the
1301   given search direction or -1 if not found. -2 is returned in case
1302   an error occurred and an exception is set. */
1303
1304PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1305    PyObject *str,		/* String */
1306    PyObject *substr,		/* Substring to find */
1307    Py_ssize_t start,		/* Start index */
1308    Py_ssize_t end,		/* Stop index */
1309    int direction		/* Find direction: +1 forward, -1 backward */
1310    );
1311
1312/* Count the number of occurrences of substr in str[start:end]. */
1313
1314PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1315    PyObject *str,		/* String */
1316    PyObject *substr,		/* Substring to count */
1317    Py_ssize_t start,		/* Start index */
1318    Py_ssize_t end		/* Stop index */
1319    );
1320
1321/* Replace at most maxcount occurrences of substr in str with replstr
1322   and return the resulting Unicode object. */
1323
1324PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1325    PyObject *str,		/* String */
1326    PyObject *substr,		/* Substring to find */
1327    PyObject *replstr,		/* Substring to replace */
1328    Py_ssize_t maxcount		/* Max. number of replacements to apply;
1329				   -1 = all */
1330    );
1331
1332/* Compare two strings and return -1, 0, 1 for less than, equal,
1333   greater than resp. */
1334
1335PyAPI_FUNC(int) PyUnicode_Compare(
1336    PyObject *left,		/* Left string */
1337    PyObject *right		/* Right string */
1338    );
1339
/* Compare a Unicode object with a C string.

   NOTE(review): presumably right is a 0-terminated ASCII string and the
   result follows the -1/0/1 convention of PyUnicode_Compare -- confirm
   against the implementation. */
1340PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1341    PyObject *left,
1342    const char *right
1343    );
1344
1345/* Rich compare two strings and return one of the following:
1346
1347   - NULL in case an exception was raised
1348   - Py_True or Py_False for successful comparisons
1349   - Py_NotImplemented in case the type combination is unknown
1350
1351   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1352   case the conversion of the arguments to Unicode fails with a
1353   UnicodeDecodeError.
1354
1355   Possible values for op:
1356
1357     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1358
1359*/
1360
1361PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1362    PyObject *left,		/* Left string */
1363    PyObject *right,		/* Right string */
1364    int op			/* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1365    );
1366
1367/* Apply an argument tuple or dictionary to a format string and return
1368   the resulting Unicode string. */
1369
1370PyAPI_FUNC(PyObject *) PyUnicode_Format(
1371    PyObject *format,		/* Format string */
1372    PyObject *args		/* Argument tuple or dictionary */
1373    );
1374
1375/* Checks whether element is contained in container and return 1/0
1376   accordingly.
1377
1378   element has to coerce to a one-element Unicode string. -1 is
1379   returned in case of an error. */
1380
1381PyAPI_FUNC(int) PyUnicode_Contains(
1382    PyObject *container,	/* Container string */
1383    PyObject *element		/* Element string */
1384    );
1385
1386/* Checks whether argument is a valid identifier. */
1387
1388PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1389
1390/* Externally visible for str.strip(unicode) */
1391PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1392    PyUnicodeObject *self,
1393    int striptype,
1394    PyObject *sepobj
1395    );
1396
1397/* === Characters Type APIs =============================================== */
1398
1399/* These should not be used directly. Use the Py_UNICODE_IS* and
1400   Py_UNICODE_TO* macros instead.
1401
1402   These APIs are implemented in Objects/unicodectype.c.
1403
1404*/
1405
1406PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1407    Py_UNICODE ch 	/* Unicode character */
1408    );
1409
1410PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1411    Py_UNICODE ch 	/* Unicode character */
1412    );
1413
1414PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1415    Py_UNICODE ch 	/* Unicode character */
1416    );
1417
1418PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1419    Py_UNICODE ch 	/* Unicode character */
1420    );
1421
1422PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1423    Py_UNICODE ch 	/* Unicode character */
1424    );
1425
1426PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1427    const Py_UNICODE ch 	/* Unicode character */
1428    );
1429
1430PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1431    const Py_UNICODE ch 	/* Unicode character */
1432    );
1433
1434PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
1435    Py_UNICODE ch 	/* Unicode character */
1436    );
1437
1438PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
1439    Py_UNICODE ch 	/* Unicode character */
1440    );
1441
1442PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
1443    Py_UNICODE ch 	/* Unicode character */
1444    );
1445
1446PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1447    Py_UNICODE ch 	/* Unicode character */
1448    );
1449
1450PyAPI_FUNC(int) _PyUnicode_ToDigit(
1451    Py_UNICODE ch 	/* Unicode character */
1452    );
1453
1454PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1455    Py_UNICODE ch 	/* Unicode character */
1456    );
1457
1458PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1459    Py_UNICODE ch 	/* Unicode character */
1460    );
1461
1462PyAPI_FUNC(int) _PyUnicode_IsDigit(
1463    Py_UNICODE ch 	/* Unicode character */
1464    );
1465
1466PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1467    Py_UNICODE ch 	/* Unicode character */
1468    );
1469
1470PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1471    Py_UNICODE ch 	/* Unicode character */
1472    );
1473
/* Py_UNICODE buffer helpers.

   The names and signatures mirror the C library functions strlen,
   strcpy, strncpy, strcmp and strchr, with Py_UNICODE in place of char.

   NOTE(review): presumably they operate on 0-terminated Py_UNICODE
   buffers with the same semantics as their C library counterparts --
   confirm against the implementation. */
1474PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u);
1475
1476PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1477    Py_UNICODE *s1, const Py_UNICODE *s2);
1478
1479PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1480    Py_UNICODE *s1, const Py_UNICODE *s2, size_t n);
1481
1482PyAPI_FUNC(int) Py_UNICODE_strcmp(
1483    const Py_UNICODE *s1, const Py_UNICODE *s2);
1484
1485PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1486    const Py_UNICODE *s, Py_UNICODE c
1487    );
1488
1489#ifdef __cplusplus
1490}
1491#endif
1492#endif /* !Py_UNICODEOBJECT_H */
1493