unicodeobject.h revision 7ade6485abde95c5cc9676ad3e476ba3aca98037
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
10Copyright (c) Corporation for National Research Initiatives.
11
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python.  This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include <ctype.h>
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
62   properly set, but the default rules below doesn't set it.  I'll
63   sort this out some other day -- fredrik@pythonware.com */
64
65#ifndef Py_UNICODE_SIZE
66#error Must define Py_UNICODE_SIZE
67#endif
68
69/* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
70   strings are stored as UCS-2 (with limited support for UTF-16) */
71
72#if Py_UNICODE_SIZE >= 4
73#define Py_UNICODE_WIDE
74#endif
75
76/* Set these flags if the platform has "wchar.h", "wctype.h" and the
77   wchar_t type is a 16-bit unsigned type */
78/* #define HAVE_WCHAR_H */
79/* #define HAVE_USABLE_WCHAR_T */
80
/* Pick a default storage type for Py_UNICODE unless the build
   configuration has already supplied PY_UNICODE_TYPE. */
#ifndef PY_UNICODE_TYPE

/* 2-byte builds on Windows: wchar_t is usable as-is (this does not
   apply when UCS-4 storage is in use). */
# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
#  define HAVE_USABLE_WCHAR_T
#  define PY_UNICODE_TYPE wchar_t
# endif

/* Wide builds store their data as Py_UCS4. */
# ifdef Py_UNICODE_WIDE
#  define PY_UNICODE_TYPE Py_UCS4
# endif

#endif /* !PY_UNICODE_TYPE */
95
/* wchar_t support: when the compiler provides the type, it is exposed
   through the interface functions PyUnicode_FromWideChar() and
   PyUnicode_AsWideChar(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
# define HAVE_WCHAR_H
#endif

#if defined(HAVE_WCHAR_H)
/* BSDI 4.x: <time.h> is included first to work around a cosmetic bug
   in that platform's wchar.h (thanks to Thomas Wouters). */
# if defined(_HAVE_BSDI)
#  include <time.h>
# endif
# include <wchar.h>
#endif /* HAVE_WCHAR_H */
113
/*
 * Py_UCS4: use this typedef when you need to represent a UTF-16
 * surrogate pair as a single unsigned integer.
 *
 * unsigned int is chosen when the build configuration reports it as at
 * least 4 bytes wide.  In every other case unsigned long is used: ISO C
 * guarantees that unsigned long can represent at least 32 bits, so the
 * typedef is always defined instead of being silently left out when
 * neither size test succeeds.
 */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#else
typedef unsigned long Py_UCS4;
#endif
123
124typedef PY_UNICODE_TYPE Py_UNICODE;
125
126/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
127
128/* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
129   produce different external names and thus cause import errors when
130   a Python interpreter and extensions compiled with different
131   Unicode width assumptions are combined. */
132
133#ifndef Py_UNICODE_WIDE
134
135# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
136# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
137# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
138# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
139# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
140# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
141# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
142# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
143# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
144# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
145# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
146# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
147# define PyUnicode_Compare PyUnicodeUCS2_Compare
148# define PyUnicode_Concat PyUnicodeUCS2_Concat
149# define PyUnicode_Append PyUnicodeUCS2_Append
150# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
151# define PyUnicode_Contains PyUnicodeUCS2_Contains
152# define PyUnicode_Count PyUnicodeUCS2_Count
153# define PyUnicode_Decode PyUnicodeUCS2_Decode
154# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
155# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
156# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
157# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
158# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
159# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
160# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
161# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
162# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
163# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
164# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
165# define PyUnicode_Encode PyUnicodeUCS2_Encode
166# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
167# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
168# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
169# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
170# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
171# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
172# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
173# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
174# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
175# define PyUnicode_Find PyUnicodeUCS2_Find
176# define PyUnicode_Format PyUnicodeUCS2_Format
177# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
178# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
179# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
180# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
181# define PyUnicode_FromString PyUnicodeUCS2_FromString
182# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
183# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
184# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
185# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
186# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
187# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
188# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
189# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
190# define PyUnicode_Join PyUnicodeUCS2_Join
191# define PyUnicode_Partition PyUnicodeUCS2_Partition
192# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
193# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
194# define PyUnicode_Replace PyUnicodeUCS2_Replace
195# define PyUnicode_Resize PyUnicodeUCS2_Resize
196# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
197# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
198# define PyUnicode_Split PyUnicodeUCS2_Split
199# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
200# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
201# define PyUnicode_Translate PyUnicodeUCS2_Translate
202# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
203# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
204# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
205# define _PyUnicode_Init _PyUnicodeUCS2_Init
206# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
207# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
208# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
209# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
210# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
211# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
212# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
213# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
214# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
215# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
216# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
217# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
218# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
219# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
220# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
221# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
222# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
223
224#else
225
226# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
227# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
228# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
229# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
230# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
231# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
232# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
233# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
234# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
235# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
236# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
237# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
238# define PyUnicode_Compare PyUnicodeUCS4_Compare
239# define PyUnicode_Concat PyUnicodeUCS4_Concat
240# define PyUnicode_Append PyUnicodeUCS4_Append
241# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
242# define PyUnicode_Contains PyUnicodeUCS4_Contains
243# define PyUnicode_Count PyUnicodeUCS4_Count
244# define PyUnicode_Decode PyUnicodeUCS4_Decode
245# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
246# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
247# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
248# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
249# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
250# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
251# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
252# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
253# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
254# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
255# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
256# define PyUnicode_Encode PyUnicodeUCS4_Encode
257# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
258# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
259# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
260# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
261# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
262# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
263# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
264# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
265# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
266# define PyUnicode_Find PyUnicodeUCS4_Find
267# define PyUnicode_Format PyUnicodeUCS4_Format
268# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
269# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
270# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
271# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
272# define PyUnicode_FromString PyUnicodeUCS4_FromString
273# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
274# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
275# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
276# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
277# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
278# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
279# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
280# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
281# define PyUnicode_Join PyUnicodeUCS4_Join
282# define PyUnicode_Partition PyUnicodeUCS4_Partition
283# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
284# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
285# define PyUnicode_Replace PyUnicodeUCS4_Replace
286# define PyUnicode_Resize PyUnicodeUCS4_Resize
287# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
288# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
289# define PyUnicode_Split PyUnicodeUCS4_Split
290# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
291# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
292# define PyUnicode_Translate PyUnicodeUCS4_Translate
293# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
294# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
295# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
296# define _PyUnicode_Init _PyUnicodeUCS4_Init
297# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
298# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
299# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
300# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
301# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
302# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
303# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
304# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
305# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
306# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
307# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
308# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
309# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
310# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
311# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
312# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
313# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
314
315
316#endif
317
318/* --- Internal Unicode Operations ---------------------------------------- */
319
/* Character classification and case conversion macros.

   By default every macro maps onto the routines supplied with Python.
   If you want Python to use the compiler's wctype.h functions instead,
   define WANT_WCTYPE_FUNCTIONS or configure Python using
   --with-wctype-functions; this reduces the interpreter's code size.
   The switch only takes effect when HAVE_USABLE_WCHAR_T is defined as
   well, and it only affects the six macros inside the conditional. */

#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

# include <wctype.h>

# define Py_UNICODE_ISSPACE(ch) iswspace(ch)
# define Py_UNICODE_ISLOWER(ch) iswlower(ch)
# define Py_UNICODE_ISUPPER(ch) iswupper(ch)
# define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
# define Py_UNICODE_TOLOWER(ch) towlower(ch)
# define Py_UNICODE_TOUPPER(ch) towupper(ch)

#else

# define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
# define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
# define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
# define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
# define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
# define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)

#endif

/* These are the same in both configurations. */
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
374
/* True if ch is alphabetic or is a decimal, digit or numeric
   character.  ch appears four times in the expansion, so it may be
   evaluated more than once. */
#define Py_UNICODE_ISALNUM(ch)                               \
    (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) ||   \
     Py_UNICODE_ISDIGIT(ch) || Py_UNICODE_ISNUMERIC(ch))
380
/* Copy length Py_UNICODE elements from source to target via
   Py_MEMCPY; the byte count passed on is length * sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
383
/* Store value into the first length elements of target.

   Every argument is evaluated exactly once, in the order target,
   value, length.  In particular length is captured in a local before
   the loop starts, so an expression that is costly or has side effects
   is not re-run on each iteration.  The do { } while (0) wrapper makes
   the macro usable as a single statement. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        Py_UNICODE *t_ = (target); \
        Py_UNICODE v_ = (value); \
        Py_ssize_t n_ = (length); \
        Py_ssize_t i_; \
        for (i_ = 0; i_ < n_; i_++) \
            t_[i_] = v_; \
    } while (0)
388
/* Test whether substring occurs in string at the given offset.  The
   offset must be valid and the substring must not be empty.

   The first and the last element are compared before memcmp() is
   called on the whole range. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((string)->str[(offset)] == (substring)->str[0] && \
     ((string)->str + (offset))[(substring)->length - 1] == \
         (substring)->str[(substring)->length - 1] && \
     !memcmp((string)->str + (offset), (substring)->str, \
             (substring)->length*sizeof(Py_UNICODE)))
395
396#ifdef __cplusplus
397extern "C" {
398#endif
399
400/* --- Unicode Type ------------------------------------------------------- */
401
402typedef struct {
403    PyObject_HEAD
404    Py_ssize_t length;		/* Length of raw Unicode data in buffer */
405    Py_UNICODE *str;		/* Raw Unicode buffer */
406    long hash;			/* Hash value; -1 if not set */
407    int state;			/* != 0 if interned. In this case the two
408    				 * references from the dictionary to this object
409    				 * are *not* counted in ob_refcnt. */
410    PyObject *defenc;		/* (Default) Encoded version as Python
411				   string, or NULL; this is used for
412				   implementing the buffer protocol */
413} PyUnicodeObject;
414
415PyAPI_DATA(PyTypeObject) PyUnicode_Type;
416
/* Interning states.  NOTE(review): presumably these are the values
   kept in PyUnicodeObject.state -- confirm against unicodeobject.c. */
#define SSTATE_NOT_INTERNED      0
#define SSTATE_INTERNED_MORTAL   1
#define SSTATE_INTERNED_IMMORTAL 2
420
/* Type checks.  PyUnicode_Check() tests the
   Py_TPFLAGS_UNICODE_SUBCLASS flag of the object's type through
   PyType_FastSubclass(); PyUnicode_CheckExact() compares the object's
   type with PyUnicode_Type itself. */
#define PyUnicode_Check(op) \
    PyType_FastSubclass(Py_Type(op), Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op) \
    (Py_Type(op) == &PyUnicode_Type)
424
/* Fast access macros.  Each one asserts PyUnicode_Check(op) and then
   reads the PyUnicodeObject fields directly. */

/* The length field of op */
#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyUnicodeObject *)(op))->length))

/* The length field of op multiplied by sizeof(Py_UNICODE) */
#define PyUnicode_GET_DATA_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))

/* The str buffer of op as a Py_UNICODE pointer */
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyUnicodeObject *)(op))->str))

/* The same str buffer, cast to const char * */
#define PyUnicode_AS_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     ((const char *)((PyUnicodeObject *)(op))->str))
434
435/* --- Constants ---------------------------------------------------------- */
436
/* Character used as the replacement during decoding when the errors
   argument is set to "replace".  U+FFFD is the official REPLACEMENT
   CHARACTER in Unicode 3.0. */
#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE)0xFFFD)
443
444/* === Public API ========================================================= */
445
446/* --- Plain Py_UNICODE --------------------------------------------------- */
447
448/* Create a Unicode Object from the Py_UNICODE buffer u of the given
449   size.
450
451   u may be NULL which causes the contents to be undefined. It is the
452   user's responsibility to fill in the needed data afterwards. Note
453   that modifying the Unicode object contents after construction is
454   only allowed if u was set to NULL.
455
456   The buffer is copied into the new object. */
457
458PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
459    const Py_UNICODE *u,        /* Unicode buffer */
460    Py_ssize_t size             /* size of buffer */
461    );
462
463/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
464PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
465    const char *u,        /* char buffer */
466    Py_ssize_t size       /* size of buffer */
467    );
468
469/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
470   Latin-1 encoded bytes */
471PyAPI_FUNC(PyObject*) PyUnicode_FromString(
472    const char *u        /* string */
473    );
474
475/* Return a read-only pointer to the Unicode object's internal
476   Py_UNICODE buffer. */
477
478PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
479    PyObject *unicode	 	/* Unicode object */
480    );
481
482/* Get the length of the Unicode object. */
483
484PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
485    PyObject *unicode	 	/* Unicode object */
486    );
487
488/* Get the maximum ordinal for a Unicode character. */
489PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
490
491/* Resize an already allocated Unicode object to the new size length.
492
493   *unicode is modified to point to the new (resized) object and 0
494   returned on success.
495
496   This API may only be called by the function which also called the
497   Unicode constructor. The refcount on the object must be 1. Otherwise,
498   an error is returned.
499
500   Error handling is implemented as follows: an exception is set, -1
501   is returned and *unicode left untouched.
502
503*/
504
505PyAPI_FUNC(int) PyUnicode_Resize(
506    PyObject **unicode,		/* Pointer to the Unicode object */
507    Py_ssize_t length		/* New length */
508    );
509
510/* Coerce obj to an Unicode object and return a reference with
511   *incremented* refcount.
512
513   Coercion is done in the following way:
514
515   1. String and other char buffer compatible objects are decoded
516      under the assumptions that they contain data using the current
517      default encoding. Decoding is done in "strict" mode.
518
519   2. All other objects (including Unicode objects) raise an
520      exception.
521
522   The API returns NULL in case of an error. The caller is responsible
523   for decref'ing the returned objects.
524
525*/
526
527PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
528    register PyObject *obj, 	/* Object */
529    const char *encoding,       /* encoding */
530    const char *errors          /* error handling */
531    );
532
533/* Coerce obj to an Unicode object and return a reference with
534   *incremented* refcount.
535
536   Unicode objects are passed back as-is (subclasses are converted to
537   true Unicode objects), all other objects are delegated to
538   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
539   using the default encoding as basis for decoding the object.
540
541   The API returns NULL in case of an error. The caller is responsible
542   for decref'ing the returned objects.
543
544*/
545
546PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
547    register PyObject *obj 	/* Object */
548    );
549
550PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
551PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
552
553PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
554PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
555PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
556PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
557
/* Read the state field of op.  No type check is performed, so use
   this only if you know that op is a string. */
#define PyUnicode_CHECK_INTERNED(op) \
    (((PyUnicodeObject *)(op))->state)
560
561/* --- wchar_t support for platforms which support it --------------------- */
562
#ifdef HAVE_WCHAR_H

/* Build a Unicode object from the wchar_t buffer w of the given size.
   The buffer is copied into the new object.

   w     wchar_t buffer
   size  size of buffer */
PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(register const wchar_t *w,
                                             Py_ssize_t size);

/* Copy the contents of a Unicode object into the wchar_t buffer w.
   At most size wchar_t characters are copied.

   The resulting wchar_t string may or may not be 0-terminated.  A
   caller that requires a 0-terminated string is responsible for
   making sure the terminator is present.

   Returns the number of wchar_t characters copied (excluding a
   possibly trailing 0-termination character), or -1 in case of an
   error.

   unicode  Unicode object
   w        wchar_t buffer
   size     size of buffer */
PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                                            register wchar_t *w,
                                            Py_ssize_t size);

#endif /* HAVE_WCHAR_H */
594
595/* --- Unicode ordinals --------------------------------------------------- */
596
597/* Create a Unicode Object from the given Unicode code point ordinal.
598
599   The ordinal must be in range(0x10000) on narrow Python builds
600   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
601   raised in case it is not.
602
603*/
604
605PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
606
607/* === Builtin Codecs =====================================================
608
609   Many of these APIs take two arguments encoding and errors. These
610   parameters encoding and errors have the same semantics as the ones
611   of the builtin unicode() API.
612
613   Setting encoding to NULL causes the default encoding to be used.
614
615   Error handling is set by errors which may also be set to NULL
616   meaning to use the default handling defined for the codec. Default
617   error handling for all builtin codecs is "strict" (ValueErrors are
618   raised).
619
620   The codecs all use a similar interface. Only deviations from the
621   generic interface are documented.
622
623*/
624
625/* --- Manage the default encoding ---------------------------------------- */
626
627/* Return a Python string holding the default encoded value of the
628   Unicode object.
629
630   The resulting string is cached in the Unicode object for subsequent
631   usage by this function. The cached version is needed to implement
632   the character buffer interface and will live (at least) as long as
633   the Unicode object itself.
634
635   The refcount of the string is *not* incremented.
636
637   *** Exported for internal use by the interpreter only !!! ***
638
639*/
640
641PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
642    PyObject *, const char *);
643
644/* Return a char* holding the default encoded value of the
645   Unicode object.
646*/
647
648PyAPI_FUNC(char *) PyUnicode_AsString(PyObject*);
649
650
651/* Returns the currently active default encoding.
652
653   The default encoding is currently implemented as run-time settable
654   process global.  This may change in future versions of the
655   interpreter to become a parameter which is managed on a per-thread
656   basis.
657
658 */
659
660PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
661
662/* Sets the currently active default encoding.
663
664   Returns 0 on success, -1 in case of an error.
665
666 */
667
668PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
669    const char *encoding	/* Encoding name in standard form */
670    );
671
672/* --- Generic Codecs ----------------------------------------------------- */
673
674/* Create a Unicode object by decoding the encoded string s of the
675   given size. */
676
677PyAPI_FUNC(PyObject*) PyUnicode_Decode(
678    const char *s,              /* encoded string */
679    Py_ssize_t size,            /* size of buffer */
680    const char *encoding,       /* encoding */
681    const char *errors          /* error handling */
682    );
683
684/* Encodes a Py_UNICODE buffer of the given size and returns a
685   Python string object. */
686
687PyAPI_FUNC(PyObject*) PyUnicode_Encode(
688    const Py_UNICODE *s,        /* Unicode char buffer */
689    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
690    const char *encoding,       /* encoding */
691    const char *errors          /* error handling */
692    );
693
694/* Encodes a Unicode object and returns the result as Python
695   object. */
696
697PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
698    PyObject *unicode,	 	/* Unicode object */
699    const char *encoding,	/* encoding */
700    const char *errors		/* error handling */
701    );
702
703/* Encodes a Unicode object and returns the result as Python string
704   object. */
705
706PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
707    PyObject *unicode,	 	/* Unicode object */
708    const char *encoding,	/* encoding */
709    const char *errors		/* error handling */
710    );
711
712PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
713    PyObject* string            /* 256 character map */
714   );
715
716
717/* --- UTF-7 Codecs ------------------------------------------------------- */
718
719PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
720    const char *string, 	/* UTF-7 encoded string */
721    Py_ssize_t length,	 	/* size of string */
722    const char *errors		/* error handling */
723    );
724
725PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
726    const Py_UNICODE *data, 	/* Unicode char buffer */
727    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
728    int encodeSetO,             /* force the encoder to encode characters in
729                                   Set O, as described in RFC2152 */
730    int encodeWhiteSpace,       /* force the encoder to encode space, tab,
731                                   carriage return and linefeed characters */
732    const char *errors		/* error handling */
733    );
734
735/* --- UTF-8 Codecs ------------------------------------------------------- */
736
737PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
738    const char *string, 	/* UTF-8 encoded string */
739    Py_ssize_t length,	 	/* size of string */
740    const char *errors		/* error handling */
741    );
742
743PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
744    const char *string, 	/* UTF-8 encoded string */
745    Py_ssize_t length,	 	/* size of string */
746    const char *errors,		/* error handling */
747    Py_ssize_t *consumed	/* bytes consumed */
748    );
749
750PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
751    PyObject *unicode	 	/* Unicode object */
752    );
753
754PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
755    const Py_UNICODE *data, 	/* Unicode char buffer */
756    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
757    const char *errors		/* error handling */
758    );
759
760/* --- UTF-32 Codecs ------------------------------------------------------ */
761
762/* Decodes length bytes from a UTF-32 encoded buffer string and returns
763   the corresponding Unicode object.
764
765   errors (if non-NULL) defines the error handling. It defaults
766   to "strict".
767
768   If byteorder is non-NULL, the decoder starts decoding using the
769   given byte order:
770
771	*byteorder == -1: little endian
772	*byteorder == 0:  native order
773	*byteorder == 1:  big endian
774
775   In native mode, the first four bytes of the stream are checked for a
776   BOM mark. If found, the BOM mark is analysed, the byte order
777   adjusted and the BOM skipped.  In the other modes, no BOM mark
778   interpretation is done. After completion, *byteorder is set to the
779   current byte order at the end of input data.
780
781   If byteorder is NULL, the codec starts in native order mode.
782
783*/
784
785PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
786    const char *string, 	/* UTF-32 encoded string */
787    Py_ssize_t length,	 	/* size of string */
788    const char *errors,		/* error handling */
789    int *byteorder		/* pointer to byteorder to use
790				   0=native;-1=LE,1=BE; updated on
791				   exit */
792    );
793
794PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
795    const char *string, 	/* UTF-32 encoded string */
796    Py_ssize_t length,	 	/* size of string */
797    const char *errors,		/* error handling */
798    int *byteorder,		/* pointer to byteorder to use
799				   0=native;-1=LE,1=BE; updated on
800				   exit */
801    Py_ssize_t *consumed	/* bytes consumed */
802    );
803
804/* Returns a Python string using the UTF-32 encoding in native byte
805   order. The string always starts with a BOM mark.  */
806
807PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
808    PyObject *unicode	 	/* Unicode object */
809    );
810
811/* Returns a Python string object holding the UTF-32 encoded value of
812   the Unicode data.
813
814   If byteorder is not 0, output is written according to the following
815   byte order:
816
817   byteorder == -1: little endian
818   byteorder == 0:  native byte order (writes a BOM mark)
819   byteorder == 1:  big endian
820
821   If byteorder is 0, the output string will always start with the
822   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
823   prepended.
824
825*/
826
827PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
828    const Py_UNICODE *data, 	/* Unicode char buffer */
829    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
830    const char *errors,		/* error handling */
831    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
832    );
833
834/* --- UTF-16 Codecs ------------------------------------------------------ */
835
836/* Decodes length bytes from a UTF-16 encoded buffer string and returns
837   the corresponding Unicode object.
838
839   errors (if non-NULL) defines the error handling. It defaults
840   to "strict".
841
842   If byteorder is non-NULL, the decoder starts decoding using the
843   given byte order:
844
845	*byteorder == -1: little endian
846	*byteorder == 0:  native order
847	*byteorder == 1:  big endian
848
849   In native mode, the first two bytes of the stream are checked for a
850   BOM mark. If found, the BOM mark is analysed, the byte order
851   adjusted and the BOM skipped.  In the other modes, no BOM mark
852   interpretation is done. After completion, *byteorder is set to the
853   current byte order at the end of input data.
854
855   If byteorder is NULL, the codec starts in native order mode.
856
857*/
858
859PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
860    const char *string, 	/* UTF-16 encoded string */
861    Py_ssize_t length,	 	/* size of string */
862    const char *errors,		/* error handling */
863    int *byteorder		/* pointer to byteorder to use
864				   0=native;-1=LE,1=BE; updated on
865				   exit */
866    );
867
868PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
869    const char *string, 	/* UTF-16 encoded string */
870    Py_ssize_t length,	 	/* size of string */
871    const char *errors,		/* error handling */
872    int *byteorder,		/* pointer to byteorder to use
873				   0=native;-1=LE,1=BE; updated on
874				   exit */
875    Py_ssize_t *consumed	/* bytes consumed */
876    );
877
878/* Returns a Python string using the UTF-16 encoding in native byte
879   order. The string always starts with a BOM mark.  */
880
881PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
882    PyObject *unicode	 	/* Unicode object */
883    );
884
885/* Returns a Python string object holding the UTF-16 encoded value of
886   the Unicode data.
887
888   If byteorder is not 0, output is written according to the following
889   byte order:
890
891   byteorder == -1: little endian
892   byteorder == 0:  native byte order (writes a BOM mark)
893   byteorder == 1:  big endian
894
895   If byteorder is 0, the output string will always start with the
896   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
897   prepended.
898
899   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
900   UCS-2. This trick makes it possible to add full UTF-16 capabilities
901   at a later point without compromising the APIs.
902
903*/
904
905PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
906    const Py_UNICODE *data, 	/* Unicode char buffer */
907    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
908    const char *errors,		/* error handling */
909    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
910    );
911
912/* --- Unicode-Escape Codecs ---------------------------------------------- */
913
914PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
915    const char *string, 	/* Unicode-Escape encoded string */
916    Py_ssize_t length,	 	/* size of string */
917    const char *errors		/* error handling */
918    );
919
920PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
921    PyObject *unicode	 	/* Unicode object */
922    );
923
924PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
925    const Py_UNICODE *data, 	/* Unicode char buffer */
926    Py_ssize_t length	 	/* Number of Py_UNICODE chars to encode */
927    );
928
929/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
930
931PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
932    const char *string, 	/* Raw-Unicode-Escape encoded string */
933    Py_ssize_t length,	 	/* size of string */
934    const char *errors		/* error handling */
935    );
936
937PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
938    PyObject *unicode	 	/* Unicode object */
939    );
940
941PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
942    const Py_UNICODE *data, 	/* Unicode char buffer */
943    Py_ssize_t length	 	/* Number of Py_UNICODE chars to encode */
944    );
945
946/* --- Unicode Internal Codec ---------------------------------------------
947
948    Only for internal use in _codecsmodule.c */
949
950PyObject *_PyUnicode_DecodeUnicodeInternal(
951    const char *string,
952    Py_ssize_t length,
953    const char *errors
954    );
955
956/* --- Latin-1 Codecs -----------------------------------------------------
957
958   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
959
960*/
961
962PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
963    const char *string, 	/* Latin-1 encoded string */
964    Py_ssize_t length,	 	/* size of string */
965    const char *errors		/* error handling */
966    );
967
968PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
969    PyObject *unicode	 	/* Unicode object */
970    );
971
972PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
973    const Py_UNICODE *data, 	/* Unicode char buffer */
974    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to encode */
975    const char *errors		/* error handling */
976    );
977
978/* --- ASCII Codecs -------------------------------------------------------
979
980   Only 7-bit ASCII data is excepted. All other codes generate errors.
981
982*/
983
984PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
985    const char *string, 	/* ASCII encoded string */
986    Py_ssize_t length,	 	/* size of string */
987    const char *errors		/* error handling */
988    );
989
990PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
991    PyObject *unicode	 	/* Unicode object */
992    );
993
994PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
995    const Py_UNICODE *data, 	/* Unicode char buffer */
996    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to encode */
997    const char *errors		/* error handling */
998    );
999
1000/* --- Character Map Codecs -----------------------------------------------
1001
1002   This codec uses mappings to encode and decode characters.
1003
1004   Decoding mappings must map single string characters to single
1005   Unicode characters, integers (which are then interpreted as Unicode
1006   ordinals) or None (meaning "undefined mapping" and causing an
1007   error).
1008
1009   Encoding mappings must map single Unicode characters to single
1010   string characters, integers (which are then interpreted as Latin-1
1011   ordinals) or None (meaning "undefined mapping" and causing an
1012   error).
1013
1014   If a character lookup fails with a LookupError, the character is
1015   copied as-is meaning that its ordinal value will be interpreted as
1016   Unicode or Latin-1 ordinal resp. Because of this mappings only need
1017   to contain those mappings which map characters to different code
1018   points.
1019
1020*/
1021
1022PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1023    const char *string, 	/* Encoded string */
1024    Py_ssize_t length,	 	/* size of string */
1025    PyObject *mapping,		/* character mapping
1026				   (char ordinal -> unicode ordinal) */
1027    const char *errors		/* error handling */
1028    );
1029
1030PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1031    PyObject *unicode,	 	/* Unicode object */
1032    PyObject *mapping		/* character mapping
1033				   (unicode ordinal -> char ordinal) */
1034    );
1035
1036PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1037    const Py_UNICODE *data, 	/* Unicode char buffer */
1038    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to encode */
1039    PyObject *mapping,		/* character mapping
1040				   (unicode ordinal -> char ordinal) */
1041    const char *errors		/* error handling */
1042    );
1043
1044/* Translate a Py_UNICODE buffer of the given length by applying a
1045   character mapping table to it and return the resulting Unicode
1046   object.
1047
1048   The mapping table must map Unicode ordinal integers to Unicode
1049   ordinal integers or None (causing deletion of the character).
1050
1051   Mapping tables may be dictionaries or sequences. Unmapped character
1052   ordinals (ones which cause a LookupError) are left untouched and
1053   are copied as-is.
1054
1055*/
1056
1057PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1058    const Py_UNICODE *data, 	/* Unicode char buffer */
1059    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to encode */
1060    PyObject *table,		/* Translate table */
1061    const char *errors		/* error handling */
1062    );
1063
#ifdef MS_WIN32

/* --- MBCS codecs for Windows -------------------------------------------- */

PyAPI_FUNC(PyObject *) PyUnicode_DecodeMBCS(
    const char *string,         /* MBCS encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
    );

PyAPI_FUNC(PyObject *) PyUnicode_DecodeMBCSStateful(
    const char *string,         /* MBCS encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    Py_ssize_t *consumed        /* bytes consumed */
    );

PyAPI_FUNC(PyObject *) PyUnicode_AsMBCSString(
    PyObject *unicode           /* Unicode object */
    );

PyAPI_FUNC(PyObject *) PyUnicode_EncodeMBCS(
    const Py_UNICODE *data,     /* Unicode char buffer */
    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
    const char *errors          /* error handling */
    );

#endif /* MS_WIN32 */
1092
1093/* --- Decimal Encoder ---------------------------------------------------- */
1094
1095/* Takes a Unicode string holding a decimal value and writes it into
1096   an output buffer using standard ASCII digit codes.
1097
1098   The output buffer has to provide at least length+1 bytes of storage
1099   area. The output string is 0-terminated.
1100
1101   The encoder converts whitespace to ' ', decimal characters to their
1102   corresponding ASCII digit and all other Latin-1 characters except
1103   \0 as-is. Characters outside this range (Unicode ordinals 1-256)
1104   are treated as errors. This includes embedded NULL bytes.
1105
1106   Error handling is defined by the errors argument:
1107
1108      NULL or "strict": raise a ValueError
1109      "ignore": ignore the wrong characters (these are not copied to the
1110		output buffer)
1111      "replace": replaces illegal characters with '?'
1112
1113   Returns 0 on success, -1 on failure.
1114
1115*/
1116
1117PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1118    Py_UNICODE *s,		/* Unicode buffer */
1119    Py_ssize_t length,		/* Number of Py_UNICODE chars to encode */
1120    char *output,		/* Output buffer; must have size >= length */
1121    const char *errors		/* error handling */
1122    );
1123
1124/* --- Methods & Slots ----------------------------------------------------
1125
1126   These are capable of handling Unicode objects and strings on input
1127   (we refer to them as strings in the descriptions) and return
1128   Unicode objects or integers as apporpriate. */
1129
1130/* Concat two strings giving a new Unicode string. */
1131
1132PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1133    PyObject *left,	 	/* Left string */
1134    PyObject *right	 	/* Right string */
1135    );
1136
1137/* Concat two strings and put the result in *pleft
1138   (sets *pleft to NULL on error) */
1139
1140PyAPI_FUNC(void) PyUnicode_Append(
1141    PyObject **pleft,	 	/* Pointer to left string */
1142    PyObject *right	 	/* Right string */
1143    );
1144
1145/* Concat two strings, put the result in *pleft and drop the right object
1146   (sets *pleft to NULL on error) */
1147
1148PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1149    PyObject **pleft,	 	/* Pointer to left string */
1150    PyObject *right	 	/* Right string */
1151    );
1152
1153/* Split a string giving a list of Unicode strings.
1154
1155   If sep is NULL, splitting will be done at all whitespace
1156   substrings. Otherwise, splits occur at the given separator.
1157
1158   At most maxsplit splits will be done. If negative, no limit is set.
1159
1160   Separators are not included in the resulting list.
1161
1162*/
1163
1164PyAPI_FUNC(PyObject*) PyUnicode_Split(
1165    PyObject *s,		/* String to split */
1166    PyObject *sep,		/* String separator */
1167    Py_ssize_t maxsplit		/* Maxsplit count */
1168    );
1169
1170/* Dito, but split at line breaks.
1171
1172   CRLF is considered to be one line break. Line breaks are not
1173   included in the resulting list. */
1174
1175PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1176    PyObject *s,		/* String to split */
1177    int keepends		/* If true, line end markers are included */
1178    );
1179
1180/* Partition a string using a given separator. */
1181
1182PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1183    PyObject *s,		/* String to partition */
1184    PyObject *sep		/* String separator */
1185    );
1186
1187/* Partition a string using a given separator, searching from the end of the
1188   string. */
1189
1190PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1191    PyObject *s,		/* String to partition */
1192    PyObject *sep		/* String separator */
1193    );
1194
1195/* Split a string giving a list of Unicode strings.
1196
1197   If sep is NULL, splitting will be done at all whitespace
1198   substrings. Otherwise, splits occur at the given separator.
1199
1200   At most maxsplit splits will be done. But unlike PyUnicode_Split
1201   PyUnicode_RSplit splits from the end of the string. If negative,
1202   no limit is set.
1203
1204   Separators are not included in the resulting list.
1205
1206*/
1207
1208PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1209    PyObject *s,		/* String to split */
1210    PyObject *sep,		/* String separator */
1211    Py_ssize_t maxsplit		/* Maxsplit count */
1212    );
1213
1214/* Translate a string by applying a character mapping table to it and
1215   return the resulting Unicode object.
1216
1217   The mapping table must map Unicode ordinal integers to Unicode
1218   ordinal integers or None (causing deletion of the character).
1219
1220   Mapping tables may be dictionaries or sequences. Unmapped character
1221   ordinals (ones which cause a LookupError) are left untouched and
1222   are copied as-is.
1223
1224*/
1225
1226PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1227    PyObject *str,		/* String */
1228    PyObject *table,		/* Translate table */
1229    const char *errors		/* error handling */
1230    );
1231
1232/* Join a sequence of strings using the given separator and return
1233   the resulting Unicode string. */
1234
1235PyAPI_FUNC(PyObject*) PyUnicode_Join(
1236    PyObject *separator, 	/* Separator string */
1237    PyObject *seq	 	/* Sequence object */
1238    );
1239
1240/* Return 1 if substr matches str[start:end] at the given tail end, 0
1241   otherwise. */
1242
1243PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1244    PyObject *str,		/* String */
1245    PyObject *substr,		/* Prefix or Suffix string */
1246    Py_ssize_t start,		/* Start index */
1247    Py_ssize_t end,		/* Stop index */
1248    int direction		/* Tail end: -1 prefix, +1 suffix */
1249    );
1250
1251/* Return the first position of substr in str[start:end] using the
1252   given search direction or -1 if not found. -2 is returned in case
1253   an error occurred and an exception is set. */
1254
1255PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1256    PyObject *str,		/* String */
1257    PyObject *substr,		/* Substring to find */
1258    Py_ssize_t start,		/* Start index */
1259    Py_ssize_t end,		/* Stop index */
1260    int direction		/* Find direction: +1 forward, -1 backward */
1261    );
1262
1263/* Count the number of occurrences of substr in str[start:end]. */
1264
1265PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1266    PyObject *str,		/* String */
1267    PyObject *substr,		/* Substring to count */
1268    Py_ssize_t start,		/* Start index */
1269    Py_ssize_t end		/* Stop index */
1270    );
1271
1272/* Replace at most maxcount occurrences of substr in str with replstr
1273   and return the resulting Unicode object. */
1274
1275PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1276    PyObject *str,		/* String */
1277    PyObject *substr,		/* Substring to find */
1278    PyObject *replstr,		/* Substring to replace */
1279    Py_ssize_t maxcount		/* Max. number of replacements to apply;
1280				   -1 = all */
1281    );
1282
1283/* Compare two strings and return -1, 0, 1 for less than, equal,
1284   greater than resp. */
1285
1286PyAPI_FUNC(int) PyUnicode_Compare(
1287    PyObject *left,		/* Left string */
1288    PyObject *right		/* Right string */
1289    );
1290
1291PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1292    PyObject *left,
1293    const char *right
1294    );
1295
1296/* Rich compare two strings and return one of the following:
1297
1298   - NULL in case an exception was raised
1299   - Py_True or Py_False for successfuly comparisons
1300   - Py_NotImplemented in case the type combination is unknown
1301
1302   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1303   case the conversion of the arguments to Unicode fails with a
1304   UnicodeDecodeError.
1305
1306   Possible values for op:
1307
1308     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1309
1310*/
1311
1312PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1313    PyObject *left,		/* Left string */
1314    PyObject *right,		/* Right string */
1315    int op			/* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1316    );
1317
1318/* Apply a argument tuple or dictionary to a format string and return
1319   the resulting Unicode string. */
1320
1321PyAPI_FUNC(PyObject *) PyUnicode_Format(
1322    PyObject *format,		/* Format string */
1323    PyObject *args		/* Argument tuple or dictionary */
1324    );
1325
1326/* Checks whether element is contained in container and return 1/0
1327   accordingly.
1328
1329   element has to coerce to an one element Unicode string. -1 is
1330   returned in case of an error. */
1331
1332PyAPI_FUNC(int) PyUnicode_Contains(
1333    PyObject *container,	/* Container string */
1334    PyObject *element		/* Element string */
1335    );
1336
1337/* Checks whether argument is a valid identifier. */
1338
1339PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1340
1341/* Externally visible for str.strip(unicode) */
1342PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1343    PyUnicodeObject *self,
1344    int striptype,
1345    PyObject *sepobj
1346    );
1347
1348/* === Characters Type APIs =============================================== */
1349
1350/* These should not be used directly. Use the Py_UNICODE_IS* and
1351   Py_UNICODE_TO* macros instead.
1352
1353   These APIs are implemented in Objects/unicodectype.c.
1354
1355*/
1356
1357PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1358    Py_UNICODE ch 	/* Unicode character */
1359    );
1360
1361PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1362    Py_UNICODE ch 	/* Unicode character */
1363    );
1364
1365PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1366    Py_UNICODE ch 	/* Unicode character */
1367    );
1368
1369PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1370    Py_UNICODE ch 	/* Unicode character */
1371    );
1372
1373PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1374    Py_UNICODE ch 	/* Unicode character */
1375    );
1376
1377PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1378    const Py_UNICODE ch 	/* Unicode character */
1379    );
1380
1381PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1382    const Py_UNICODE ch 	/* Unicode character */
1383    );
1384
1385PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
1386    Py_UNICODE ch 	/* Unicode character */
1387    );
1388
1389PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
1390    Py_UNICODE ch 	/* Unicode character */
1391    );
1392
1393PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
1394    Py_UNICODE ch 	/* Unicode character */
1395    );
1396
1397PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1398    Py_UNICODE ch 	/* Unicode character */
1399    );
1400
1401PyAPI_FUNC(int) _PyUnicode_ToDigit(
1402    Py_UNICODE ch 	/* Unicode character */
1403    );
1404
1405PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1406    Py_UNICODE ch 	/* Unicode character */
1407    );
1408
1409PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1410    Py_UNICODE ch 	/* Unicode character */
1411    );
1412
1413PyAPI_FUNC(int) _PyUnicode_IsDigit(
1414    Py_UNICODE ch 	/* Unicode character */
1415    );
1416
1417PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1418    Py_UNICODE ch 	/* Unicode character */
1419    );
1420
1421PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1422    Py_UNICODE ch 	/* Unicode character */
1423    );
1424
1425PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u);
1426
1427PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1428    Py_UNICODE *s1, const Py_UNICODE *s2);
1429
1430PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1431    Py_UNICODE *s1, const Py_UNICODE *s2, size_t n);
1432
1433PyAPI_FUNC(int) Py_UNICODE_strcmp(
1434    const Py_UNICODE *s1, const Py_UNICODE *s2);
1435
1436PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1437    const Py_UNICODE *s, Py_UNICODE c
1438    );
1439
1440PyObject *
1441_unicodeformatter_iterator(PyObject *str);
1442PyObject *
1443_unicodeformatter_field_name_split(PyObject *field_name);
1444
1445#ifdef __cplusplus
1446}
1447#endif
1448#endif /* !Py_UNICODEOBJECT_H */
1449