unicodeobject.h revision a156e09b19cd239176a9316ddca7641784eea99e
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal (see file Misc/unicode.txt).
11
12Copyright (c) Corporation for National Research Initiatives.
13
14
15 Original header:
16 --------------------------------------------------------------------
17
18 * Yet another Unicode string type for Python.  This type supports the
19 * 16-bit Basic Multilingual Plane (BMP) only.
20 *
21 * Written by Fredrik Lundh, January 1999.
22 *
23 * Copyright (c) 1999 by Secret Labs AB.
24 * Copyright (c) 1999 by Fredrik Lundh.
25 *
26 * fredrik@pythonware.com
27 * http://www.pythonware.com
28 *
29 * --------------------------------------------------------------------
30 * This Unicode String Type is
31 *
32 * Copyright (c) 1999 by Secret Labs AB
33 * Copyright (c) 1999 by Fredrik Lundh
34 *
35 * By obtaining, using, and/or copying this software and/or its
36 * associated documentation, you agree that you have read, understood,
37 * and will comply with the following terms and conditions:
38 *
39 * Permission to use, copy, modify, and distribute this software and its
40 * associated documentation for any purpose and without fee is hereby
41 * granted, provided that the above copyright notice appears in all
42 * copies, and that both that copyright notice and this permission notice
43 * appear in supporting documentation, and that the name of Secret Labs
44 * AB or the author not be used in advertising or publicity pertaining to
45 * distribution of the software without specific, written prior
46 * permission.
47 *
48 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
50 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
55 * -------------------------------------------------------------------- */
56
57#include <ctype.h>
58
59/* === Internal API ======================================================= */
60
61/* --- Internal Unicode Format -------------------------------------------- */
62
63/* Python 3.x requires unicode */
64#define Py_USING_UNICODE
65
66/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
67   properly set, but the default rules below don't set it.  I'll
68   sort this out some other day -- fredrik@pythonware.com */
69
70#ifndef Py_UNICODE_SIZE
71#error Must define Py_UNICODE_SIZE
72#endif
73
74/* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
75   strings are stored as UCS-2 (with limited support for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h", "wctype.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
86/* Defaults for various platforms */
87#ifndef PY_UNICODE_TYPE
88
89/* Windows has a usable wchar_t type (unless we're using UCS-4) */
90# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
91#  define HAVE_USABLE_WCHAR_T
92#  define PY_UNICODE_TYPE wchar_t
93# endif
94
95# if defined(Py_UNICODE_WIDE)
96#  define PY_UNICODE_TYPE Py_UCS4
97# endif
98
99#endif
100
101/* If the compiler provides a wchar_t type we try to support it
102   through the interface functions PyUnicode_FromWideChar() and
103   PyUnicode_AsWideChar(). */
104
105#ifdef HAVE_USABLE_WCHAR_T
106# ifndef HAVE_WCHAR_H
107#  define HAVE_WCHAR_H
108# endif
109#endif
110
111#ifdef HAVE_WCHAR_H
112/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
113# ifdef _HAVE_BSDI
114#  include <time.h>
115# endif
116#  include <wchar.h>
117#endif
118
119/*
120 * Use this typedef when you need to represent a UTF-16 surrogate pair
121 * as a single unsigned integer.
 *
 * Py_UCS4 is unsigned int when SIZEOF_INT >= 4, otherwise unsigned long when
 * SIZEOF_LONG >= 4.  If neither condition holds (for instance because the
 * SIZEOF_* macros are not defined), no typedef is emitted here.
122 */
123#if SIZEOF_INT >= 4
124typedef unsigned int Py_UCS4;
125#elif SIZEOF_LONG >= 4
126typedef unsigned long Py_UCS4;
127#endif
128
/* Py_UNICODE is whatever PY_UNICODE_TYPE expands to at this point. */
129typedef PY_UNICODE_TYPE Py_UNICODE;
130
131/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
132
133/* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
134   produce different external names and thus cause import errors when
135   a Python interpreter and extensions compiled with different
136   Unicode width assumptions are combined. */
137
138#ifndef Py_UNICODE_WIDE
139
140# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
141# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
142# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
143# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
144# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
145# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
146# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
147# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
148# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
149# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
150# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
151# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
152# define PyUnicode_Compare PyUnicodeUCS2_Compare
153# define PyUnicode_Concat PyUnicodeUCS2_Concat
154# define PyUnicode_Append PyUnicodeUCS2_Append
155# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
156# define PyUnicode_Contains PyUnicodeUCS2_Contains
157# define PyUnicode_Count PyUnicodeUCS2_Count
158# define PyUnicode_Decode PyUnicodeUCS2_Decode
159# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
160# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
161# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
162# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
163# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize
164# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
165# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
166# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
167# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
168# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
169# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
170# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
171# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
172# define PyUnicode_Encode PyUnicodeUCS2_Encode
173# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
174# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
175# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
176# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
177# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
178# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
179# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
180# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
181# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
182# define PyUnicode_Find PyUnicodeUCS2_Find
183# define PyUnicode_Format PyUnicodeUCS2_Format
184# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
185# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
186# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
187# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
188# define PyUnicode_FromString PyUnicodeUCS2_FromString
189# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
190# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
191# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
192# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
193# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
194# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
195# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
196# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
197# define PyUnicode_Join PyUnicodeUCS2_Join
198# define PyUnicode_Partition PyUnicodeUCS2_Partition
199# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
200# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
201# define PyUnicode_Replace PyUnicodeUCS2_Replace
202# define PyUnicode_Resize PyUnicodeUCS2_Resize
203# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
204# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
205# define PyUnicode_Split PyUnicodeUCS2_Split
206# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
207# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
208# define PyUnicode_Translate PyUnicodeUCS2_Translate
209# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
210# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
211# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
212# define _PyUnicode_Init _PyUnicodeUCS2_Init
213# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist
214# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
215# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
216# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
217# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
218# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
219# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
220# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
221# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
222# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
223# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
224# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
225# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
226# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
227# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
228# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
229# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
230# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
231
232#else
233
234# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
235# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
236# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
237# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
238# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
239# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
240# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
241# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
242# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
243# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
244# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
245# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
246# define PyUnicode_Compare PyUnicodeUCS4_Compare
247# define PyUnicode_Concat PyUnicodeUCS4_Concat
248# define PyUnicode_Append PyUnicodeUCS4_Append
249# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
250# define PyUnicode_Contains PyUnicodeUCS4_Contains
251# define PyUnicode_Count PyUnicodeUCS4_Count
252# define PyUnicode_Decode PyUnicodeUCS4_Decode
253# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
254# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
255# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
256# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
257# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize
258# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
259# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
260# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
261# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
262# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
263# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
264# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
265# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
266# define PyUnicode_Encode PyUnicodeUCS4_Encode
267# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
268# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
269# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
270# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
271# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
272# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
273# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
274# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
275# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
276# define PyUnicode_Find PyUnicodeUCS4_Find
277# define PyUnicode_Format PyUnicodeUCS4_Format
278# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
279# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
280# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
281# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
282# define PyUnicode_FromString PyUnicodeUCS4_FromString
283# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
284# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
285# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
286# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
287# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
288# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
289# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
290# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
291# define PyUnicode_Join PyUnicodeUCS4_Join
292# define PyUnicode_Partition PyUnicodeUCS4_Partition
293# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
294# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
295# define PyUnicode_Replace PyUnicodeUCS4_Replace
296# define PyUnicode_Resize PyUnicodeUCS4_Resize
297# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
298# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
299# define PyUnicode_Split PyUnicodeUCS4_Split
300# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
301# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
302# define PyUnicode_Translate PyUnicodeUCS4_Translate
303# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
304# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
305# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
306# define _PyUnicode_Init _PyUnicodeUCS4_Init
/* Wide (UCS-4) builds must export the UCS4-mangled symbol.  Mapping this name
   to the UCS2 symbol would defeat the name mangling that keeps UCS-2 and
   UCS-4 binaries from being combined. */
# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist
308# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
309# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
310# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
311# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
312# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
313# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
314# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
315# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
316# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
317# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
318# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
319# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
320# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
321# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
322# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
323# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
324# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
325
326
327#endif
328
329/* --- Internal Unicode Operations ---------------------------------------- */
330
331/* If you want Python to use the compiler's wctype.h functions instead
332   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
333   configure Python using --with-wctype-functions.  This reduces the
334   interpreter's code size. */
335
336#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
337
338#include <wctype.h>
339
340#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
341
342#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
343#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
344#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
345#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
346
347#define Py_UNICODE_TOLOWER(ch) towlower(ch)
348#define Py_UNICODE_TOUPPER(ch) towupper(ch)
349#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
350
351#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
352#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
353#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
354
355#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
356#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
357#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
358
359#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
360
361#else
362
363/* Since splitting on whitespace is an important use case, and whitespace
364   in most situations is solely ASCII whitespace, we optimize for the common
365   case by using a quick look-up table with an inlined check.
   Values of ch below 128 index _Py_ascii_whitespace directly; everything
   else is handed to _PyUnicode_IsWhitespace().  ch is evaluated twice, so
   avoid arguments with side effects.
366 */
367extern const unsigned char _Py_ascii_whitespace[];
368
369#define Py_UNICODE_ISSPACE(ch) \
370	((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
371
372#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
373#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
374#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
375#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
376
377#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
378#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
379#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
380
381#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
382#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
383#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
384
385#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
386#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
387#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
388
389#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
390
391#endif
392
/* True if ch satisfies any of Py_UNICODE_ISALPHA, Py_UNICODE_ISDECIMAL,
   Py_UNICODE_ISDIGIT or Py_UNICODE_ISNUMERIC, tested in that order with
   short-circuiting.  ch may be evaluated up to four times, so avoid
   arguments with side effects. */
393#define Py_UNICODE_ISALNUM(ch) \
394       (Py_UNICODE_ISALPHA(ch) || \
395        Py_UNICODE_ISDECIMAL(ch) || \
396        Py_UNICODE_ISDIGIT(ch) || \
397        Py_UNICODE_ISNUMERIC(ch))
398
/* Copy length Py_UNICODE elements from source to target through Py_MEMCPY;
   the byte count passed on is length * sizeof(Py_UNICODE).
   NOTE(review): presumably the two regions must not overlap, as with
   memcpy() -- confirm against the definition of Py_MEMCPY. */
399#define Py_UNICODE_COPY(target, source, length)				\
400	Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
401
/* Store value into the first length elements of the Py_UNICODE buffer target.
   Every argument is evaluated exactly once, in the order target, value,
   length.  (The length expression used to be re-evaluated on each loop
   iteration, which misbehaved for arguments with side effects.)  A length
   <= 0 stores nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        Py_ssize_t i_; \
        Py_UNICODE *t_ = (target); \
        Py_UNICODE v_ = (value); \
        Py_ssize_t n_ = (length); \
        for (i_ = 0; i_ < n_; i_++) \
            t_[i_] = v_; \
    } while (0)
406
407/* Check whether substring matches string at the given offset.  The first
408   and last characters are compared before falling back to memcmp() over
   the whole substring.  The offset must be valid, and the substring must
   not be empty.  Both string and substring are accessed through their
   ->str and ->length members, and every argument is evaluated more than
   once. */
409#define Py_UNICODE_MATCH(string, offset, substring) \
410    ((*((string)->str + (offset)) == *((substring)->str)) && \
411    ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
412     !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
413
414#ifdef __cplusplus
415extern "C" {
416#endif
417
418PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
419
420/* --- Unicode Type ------------------------------------------------------- */
421
/* Instance layout of a Unicode object.  The fast access macros further down
   in this header cast to this struct and read length and str directly. */
422typedef struct {
423    PyObject_HEAD
424    Py_ssize_t length;		/* Number of Py_UNICODE units in the buffer */
425    Py_UNICODE *str;		/* Raw Unicode buffer */
426    long hash;			/* Hash value; -1 if not set */
427    int state;			/* != 0 if interned (presumably one of the
428    				 * SSTATE_* values). In this case the two
429    				 * references from the dictionary to this object
    				 * are *not* counted in ob_refcnt. */
430    PyObject *defenc;		/* (Default) Encoded version as Python
431				   string, or NULL; this is used for
432				   implementing the buffer protocol */
433} PyUnicodeObject;
434
435PyAPI_DATA(PyTypeObject) PyUnicode_Type;
436PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
437
438#define SSTATE_NOT_INTERNED 0
439#define SSTATE_INTERNED_MORTAL 1
440#define SSTATE_INTERNED_IMMORTAL 2
441
/* PyUnicode_Check(op) tests op's type with PyType_FastSubclass() against
   Py_TPFLAGS_UNICODE_SUBCLASS (presumably true for Unicode objects and for
   instances of subtypes -- confirm against PyType_FastSubclass).
   PyUnicode_CheckExact(op) is true only when op's type is exactly
   PyUnicode_Type. */
442#define PyUnicode_Check(op) \
443                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
444#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
445
446/* Fast access macros.
   Each one casts op to PyUnicodeObject* and reads a field directly; the only
   type check is an assert(PyUnicode_Check(op)), so op must already be known
   to be a Unicode object.
     PyUnicode_GET_SIZE       the length field
     PyUnicode_GET_DATA_SIZE  length * sizeof(Py_UNICODE)
     PyUnicode_AS_UNICODE     the str field (Py_UNICODE buffer)
     PyUnicode_AS_DATA        the same buffer cast to const char*
   op is expanded twice in each macro. */
447#define PyUnicode_GET_SIZE(op) \
448        (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length))
449#define PyUnicode_GET_DATA_SIZE(op) \
450        (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))
451#define PyUnicode_AS_UNICODE(op) \
452        (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str))
453#define PyUnicode_AS_DATA(op) \
454        (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str))
455
456/* --- Constants ---------------------------------------------------------- */
457
458/* This Unicode character will be used as replacement character during
459   decoding if the errors argument is set to "replace". Note: the
460   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
461   Unicode 3.0. */
462
463#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
464
465/* === Public API ========================================================= */
466
467/* --- Plain Py_UNICODE --------------------------------------------------- */
468
469/* Create a Unicode Object from the Py_UNICODE buffer u of the given
470   size.
471
472   u may be NULL which causes the contents to be undefined. It is the
473   user's responsibility to fill in the needed data afterwards. Note
474   that modifying the Unicode object contents after construction is
475   only allowed if u was set to NULL.
476
477   The buffer is copied into the new object. */
478
479PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
480    const Py_UNICODE *u,        /* Unicode buffer */
481    Py_ssize_t size             /* size of buffer */
482    );
483
484/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
485PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
486    const char *u,        /* char buffer */
487    Py_ssize_t size       /* size of buffer */
488    );
489
490/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
491   Latin-1 encoded bytes */
492PyAPI_FUNC(PyObject*) PyUnicode_FromString(
493    const char *u        /* string */
494    );
495
496/* Return a read-only pointer to the Unicode object's internal
497   Py_UNICODE buffer. */
498
499PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
500    PyObject *unicode	 	/* Unicode object */
501    );
502
503/* Get the length of the Unicode object. */
504
505PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
506    PyObject *unicode	 	/* Unicode object */
507    );
508
509/* Get the maximum ordinal for a Unicode character. */
510PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
511
512/* Resize an already allocated Unicode object to the new size length.
513
514   *unicode is modified to point to the new (resized) object and 0
515   returned on success.
516
517   This API may only be called by the function which also called the
518   Unicode constructor. The refcount on the object must be 1. Otherwise,
519   an error is returned.
520
521   Error handling is implemented as follows: an exception is set, -1
522   is returned and *unicode left untouched.
523
524*/
525
526PyAPI_FUNC(int) PyUnicode_Resize(
527    PyObject **unicode,		/* Pointer to the Unicode object */
528    Py_ssize_t length		/* New length */
529    );
530
531/* Coerce obj to a Unicode object and return a reference with
532   *incremented* refcount.
533
534   Coercion is done in the following way:
535
536   1. String and other char buffer compatible objects are decoded
537      under the assumptions that they contain data using the current
538      default encoding. Decoding is done in "strict" mode.
539
540   2. All other objects (including Unicode objects) raise an
541      exception.
542
543   The API returns NULL in case of an error. The caller is responsible
544   for decref'ing the returned objects.
545
546*/
547
548PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
549    register PyObject *obj, 	/* Object */
550    const char *encoding,       /* encoding */
551    const char *errors          /* error handling */
552    );
553
554/* Coerce obj to a Unicode object and return a reference with
555   *incremented* refcount.
556
557   Unicode objects are passed back as-is (subclasses are converted to
558   true Unicode objects), all other objects are delegated to
559   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
560   using the default encoding as basis for decoding the object.
561
562   The API returns NULL in case of an error. The caller is responsible
563   for decref'ing the returned objects.
564
565*/
566
567PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
568    register PyObject *obj 	/* Object */
569    );
570
571PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
572PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
573
574PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
575PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
576PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
577PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
578
579/* Use only if you know it's a string */
580#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
581
582/* --- wchar_t support for platforms which support it --------------------- */
583
584#ifdef HAVE_WCHAR_H
585
586/* Create a Unicode Object from the wchar_t buffer w of the given
587   size.
588
589   The buffer is copied into the new object. */
590
591PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
592    register const wchar_t *w,  /* wchar_t buffer */
593    Py_ssize_t size             /* size of buffer */
594    );
595
596/* Copies the Unicode Object contents into the wchar_t buffer w.  At
597   most size wchar_t characters are copied.
598
599   Note that the resulting wchar_t string may or may not be
600   0-terminated.  It is the responsibility of the caller to make sure
601   that the wchar_t string is 0-terminated in case this is required by
602   the application.
603
604   Returns the number of wchar_t characters copied (excluding a
605   possibly trailing 0-termination character) or -1 in case of an
606   error. */
607
608PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
609    PyUnicodeObject *unicode,   /* Unicode object */
610    register wchar_t *w,        /* wchar_t buffer */
611    Py_ssize_t size             /* size of buffer */
612    );
613
614#endif
615
616/* --- Unicode ordinals --------------------------------------------------- */
617
618/* Create a Unicode Object from the given Unicode code point ordinal.
619
620   The ordinal must be in range(0x10000) on narrow Python builds
621   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
622   raised in case it is not.
623
624*/
625
626PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
627
628/* === Builtin Codecs =====================================================
629
630   Many of these APIs take two arguments encoding and errors. These
631   parameters encoding and errors have the same semantics as the ones
632   of the builtin unicode() API.
633
634   Setting encoding to NULL causes the default encoding to be used.
635
636   Error handling is set by errors which may also be set to NULL
637   meaning to use the default handling defined for the codec. Default
638   error handling for all builtin codecs is "strict" (ValueErrors are
639   raised).
640
641   The codecs all use a similar interface. Only deviations from the
642   generic ones are documented.
643
644*/
645
646/* --- Manage the default encoding ---------------------------------------- */
647
648/* Return a Python string holding the default encoded value of the
649   Unicode object.
650
651   The resulting string is cached in the Unicode object for subsequent
652   usage by this function. The cached version is needed to implement
653   the character buffer interface and will live (at least) as long as
654   the Unicode object itself.
655
656   The refcount of the string is *not* incremented.
657
658   *** Exported for internal use by the interpreter only !!! ***
659
660*/
661
662PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
663    PyObject *, const char *);
664
665/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
666
667   The encoding is used directly if it is supported by one of the built-in
668   codecs (i.e., UTF-8, UTF-16, UTF-32, Latin-1 or MBCS); otherwise the
669   function falls back to UTF-8 and replaces invalid characters with '?'.
670
671   The function is intended to be used for paths and file names only
672   during bootstrapping process where the codecs are not set up.
673*/
674
675PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
676    const char *s               /* encoded string */
677    );
678
679PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
680    const char *s,               /* encoded string */
681    Py_ssize_t size              /* size */
682    );
683
684
685/* Returns the UTF-8 encoding of the Unicode object, and its size.
686
687   The size is stored through the Py_ssize_t* output argument; if that
688   argument is NULL, no size is stored.
689*/
690
691PyAPI_FUNC(char *) PyUnicode_AsStringAndSize(PyObject*, Py_ssize_t *);
692
693/* Return a char* holding the UTF-8 encoded value of the Unicode object.
694   This is equivalent to PyUnicode_AsStringAndSize(x, NULL).
695   DEPRECATED: use PyUnicode_AsStringAndSize() instead.
696 */
697
698PyAPI_FUNC(char *) PyUnicode_AsString(PyObject*);
699
700/* Returns the name of the currently active default encoding.
701
702   See PyUnicode_SetDefaultEncoding() below for changing it.
703
704 */
705
706PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
707
708/* Sets the currently active default encoding.
709
710   Returns 0 on success, -1 in case of an error.
711
712 */
713
714PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
715    const char *encoding	/* Encoding name in standard form */
716    );
717
718/* --- Generic Codecs ----------------------------------------------------- */
719
720/* Create a Unicode object by decoding the encoded string s of the
721   given size. */
722
723PyAPI_FUNC(PyObject*) PyUnicode_Decode(
724    const char *s,              /* encoded string */
725    Py_ssize_t size,            /* size of buffer */
726    const char *encoding,       /* encoding */
727    const char *errors          /* error handling */
728    );
729
730/* Encodes a Py_UNICODE buffer of the given size and returns a
731   Python string object. */
732
733PyAPI_FUNC(PyObject*) PyUnicode_Encode(
734    const Py_UNICODE *s,        /* Unicode char buffer */
735    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
736    const char *encoding,       /* encoding */
737    const char *errors          /* error handling */
738    );
739
740/* Encodes a Unicode object and returns the result as Python
741   object. */
742
743PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
744    PyObject *unicode,	 	/* Unicode object */
745    const char *encoding,	/* encoding */
746    const char *errors		/* error handling */
747    );
748
749/* Encodes a Unicode object and returns the result as Python string
750   object. */
751
752PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
753    PyObject *unicode,	 	/* Unicode object */
754    const char *encoding,	/* encoding */
755    const char *errors		/* error handling */
756    );
757
758PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
759    PyObject* string            /* 256 character map */
760   );
761
762
763/* --- UTF-7 Codecs ------------------------------------------------------- */
764
765PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
766    const char *string, 	/* UTF-7 encoded string */
767    Py_ssize_t length,	 	/* size of string */
768    const char *errors		/* error handling */
769    );
770
771PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
772    const char *string, 	/* UTF-7 encoded string */
773    Py_ssize_t length,	 	/* size of string */
774    const char *errors,		/* error handling */
775    Py_ssize_t *consumed	/* bytes consumed */
776    );
777
778PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
779    const Py_UNICODE *data, 	/* Unicode char buffer */
780    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
781    int encodeSetO,             /* force the encoder to encode characters in
782                                   Set O, as described in RFC2152 */
783    int encodeWhiteSpace,       /* force the encoder to encode space, tab,
784                                   carriage return and linefeed characters */
785    const char *errors		/* error handling */
786    );
787
788/* --- UTF-8 Codecs ------------------------------------------------------- */
789
790PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
791    const char *string, 	/* UTF-8 encoded string */
792    Py_ssize_t length,	 	/* size of string */
793    const char *errors		/* error handling */
794    );
795
796PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
797    const char *string, 	/* UTF-8 encoded string */
798    Py_ssize_t length,	 	/* size of string */
799    const char *errors,		/* error handling */
800    Py_ssize_t *consumed	/* bytes consumed */
801    );
802
803PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
804    PyObject *unicode	 	/* Unicode object */
805    );
806
807PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
808    const Py_UNICODE *data, 	/* Unicode char buffer */
809    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
810    const char *errors		/* error handling */
811    );
812
813/* --- UTF-32 Codecs ------------------------------------------------------ */
814
815/* Decodes length bytes from a UTF-32 encoded buffer string and returns
816   the corresponding Unicode object.
817
818   errors (if non-NULL) defines the error handling. It defaults
819   to "strict".
820
821   If byteorder is non-NULL, the decoder starts decoding using the
822   given byte order:
823
824	*byteorder == -1: little endian
825	*byteorder == 0:  native order
826	*byteorder == 1:  big endian
827
828   In native mode, the first four bytes of the stream are checked for a
829   BOM mark. If found, the BOM mark is analysed, the byte order
830   adjusted and the BOM skipped.  In the other modes, no BOM mark
831   interpretation is done. After completion, *byteorder is set to the
832   current byte order at the end of input data.
833
834   If byteorder is NULL, the codec starts in native order mode.
835
836*/
837
838PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
839    const char *string, 	/* UTF-32 encoded string */
840    Py_ssize_t length,	 	/* size of string */
841    const char *errors,		/* error handling */
842    int *byteorder		/* pointer to byteorder to use
843				   0=native;-1=LE,1=BE; updated on
844				   exit */
845    );
846
847PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
848    const char *string, 	/* UTF-32 encoded string */
849    Py_ssize_t length,	 	/* size of string */
850    const char *errors,		/* error handling */
851    int *byteorder,		/* pointer to byteorder to use
852				   0=native;-1=LE,1=BE; updated on
853				   exit */
854    Py_ssize_t *consumed	/* bytes consumed */
855    );
856
857/* Returns a Python string using the UTF-32 encoding in native byte
858   order. The string always starts with a BOM mark.  */
859
860PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
861    PyObject *unicode	 	/* Unicode object */
862    );
863
864/* Returns a Python string object holding the UTF-32 encoded value of
865   the Unicode data.
866
867   The output is written using the byte order selected by byteorder,
868   as follows:
869
870   byteorder == -1: little endian
871   byteorder == 0:  native byte order (writes a BOM mark)
872   byteorder == 1:  big endian
873
874   If byteorder is 0, the output string will always start with the
875   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
876   prepended.
877
878*/
879
880PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
881    const Py_UNICODE *data, 	/* Unicode char buffer */
882    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
883    const char *errors,		/* error handling */
884    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
885    );
886
887/* --- UTF-16 Codecs ------------------------------------------------------ */
888
889/* Decodes length bytes from a UTF-16 encoded buffer string and returns
890   the corresponding Unicode object.
891
892   errors (if non-NULL) defines the error handling. It defaults
893   to "strict".
894
895   If byteorder is non-NULL, the decoder starts decoding using the
896   given byte order:
897
898	*byteorder == -1: little endian
899	*byteorder == 0:  native order
900	*byteorder == 1:  big endian
901
902   In native mode, the first two bytes of the stream are checked for a
903   BOM mark. If found, the BOM mark is analysed, the byte order
904   adjusted and the BOM skipped.  In the other modes, no BOM mark
905   interpretation is done. After completion, *byteorder is set to the
906   current byte order at the end of input data.
907
908   If byteorder is NULL, the codec starts in native order mode.
909
910*/
911
912PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
913    const char *string, 	/* UTF-16 encoded string */
914    Py_ssize_t length,	 	/* size of string */
915    const char *errors,		/* error handling */
916    int *byteorder		/* pointer to byteorder to use
917				   0=native;-1=LE,1=BE; updated on
918				   exit */
919    );
920
921PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
922    const char *string, 	/* UTF-16 encoded string */
923    Py_ssize_t length,	 	/* size of string */
924    const char *errors,		/* error handling */
925    int *byteorder,		/* pointer to byteorder to use
926				   0=native;-1=LE,1=BE; updated on
927				   exit */
928    Py_ssize_t *consumed	/* bytes consumed */
929    );
930
931/* Returns a Python string using the UTF-16 encoding in native byte
932   order. The string always starts with a BOM mark.  */
933
934PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
935    PyObject *unicode	 	/* Unicode object */
936    );
937
938/* Returns a Python string object holding the UTF-16 encoded value of
939   the Unicode data.
940
941   The output is written using the byte order selected by byteorder,
942   as follows:
943
944   byteorder == -1: little endian
945   byteorder == 0:  native byte order (writes a BOM mark)
946   byteorder == 1:  big endian
947
948   If byteorder is 0, the output string will always start with the
949   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
950   prepended.
951
952   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
953   UCS-2. This trick makes it possible to add full UTF-16 capabilities
954   at a later point without compromising the APIs.
955
956*/
957
958PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
959    const Py_UNICODE *data, 	/* Unicode char buffer */
960    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
961    const char *errors,		/* error handling */
962    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
963    );
964
965/* --- Unicode-Escape Codecs ---------------------------------------------- */
966
967PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
968    const char *string, 	/* Unicode-Escape encoded string */
969    Py_ssize_t length,	 	/* size of string */
970    const char *errors		/* error handling */
971    );
972
973PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
974    PyObject *unicode	 	/* Unicode object */
975    );
976
977PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
978    const Py_UNICODE *data, 	/* Unicode char buffer */
979    Py_ssize_t length	 	/* Number of Py_UNICODE chars to encode */
980    );
981
982/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
983
984PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
985    const char *string, 	/* Raw-Unicode-Escape encoded string */
986    Py_ssize_t length,	 	/* size of string */
987    const char *errors		/* error handling */
988    );
989
990PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
991    PyObject *unicode	 	/* Unicode object */
992    );
993
994PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
995    const Py_UNICODE *data, 	/* Unicode char buffer */
996    Py_ssize_t length	 	/* Number of Py_UNICODE chars to encode */
997    );
998
999/* --- Unicode Internal Codec ---------------------------------------------
1000
1001    Only for internal use in _codecsmodule.c */
1002
1003PyObject *_PyUnicode_DecodeUnicodeInternal(
1004    const char *string,		/* encoded string */
1005    Py_ssize_t length,		/* size of string */
1006    const char *errors		/* error handling */
1007    );
1008
1009/* --- Latin-1 Codecs -----------------------------------------------------
1010
1011   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1012
1013*/
1014
1015PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1016    const char *string, 	/* Latin-1 encoded string */
1017    Py_ssize_t length,	 	/* size of string */
1018    const char *errors		/* error handling */
1019    );
1020
1021PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1022    PyObject *unicode	 	/* Unicode object */
1023    );
1024
1025PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1026    const Py_UNICODE *data, 	/* Unicode char buffer */
1027    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to encode */
1028    const char *errors		/* error handling */
1029    );
1030
1031/* --- ASCII Codecs -------------------------------------------------------
1032
1033   Only 7-bit ASCII data is accepted. All other codes generate errors.
1034
1035*/
1036
1037PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1038    const char *string, 	/* ASCII encoded string */
1039    Py_ssize_t length,	 	/* size of string */
1040    const char *errors		/* error handling */
1041    );
1042
1043PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1044    PyObject *unicode	 	/* Unicode object */
1045    );
1046
1047PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1048    const Py_UNICODE *data, 	/* Unicode char buffer */
1049    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to encode */
1050    const char *errors		/* error handling */
1051    );
1052
1053/* --- Character Map Codecs -----------------------------------------------
1054
1055   This codec uses mappings to encode and decode characters.
1056
1057   Decoding mappings must map single string characters to single
1058   Unicode characters, integers (which are then interpreted as Unicode
1059   ordinals) or None (meaning "undefined mapping" and causing an
1060   error).
1061
1062   Encoding mappings must map single Unicode characters to single
1063   string characters, integers (which are then interpreted as Latin-1
1064   ordinals) or None (meaning "undefined mapping" and causing an
1065   error).
1066
1067   If a character lookup fails with a LookupError, the character is
1068   copied as-is, meaning that its ordinal value will be interpreted as
1069   a Unicode or Latin-1 ordinal resp. Because of this, mappings only
1070   need to contain those entries which map characters to different
1071   code points.
1072
1073*/
1074
1075PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1076    const char *string, 	/* Encoded string */
1077    Py_ssize_t length,	 	/* size of string */
1078    PyObject *mapping,		/* character mapping
1079				   (char ordinal -> unicode ordinal) */
1080    const char *errors		/* error handling */
1081    );
1082
1083PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1084    PyObject *unicode,	 	/* Unicode object */
1085    PyObject *mapping		/* character mapping
1086				   (unicode ordinal -> char ordinal) */
1087    );
1088
1089PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1090    const Py_UNICODE *data, 	/* Unicode char buffer */
1091    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to encode */
1092    PyObject *mapping,		/* character mapping
1093				   (unicode ordinal -> char ordinal) */
1094    const char *errors		/* error handling */
1095    );
1096
1097/* Translate a Py_UNICODE buffer of the given length by applying a
1098   character mapping table to it and return the resulting Unicode
1099   object.
1100
1101   The mapping table must map Unicode ordinal integers to Unicode
1102   ordinal integers or None (causing deletion of the character).
1103
1104   Mapping tables may be dictionaries or sequences. Unmapped character
1105   ordinals (ones which cause a LookupError) are left untouched and
1106   are copied as-is.
1107
1108*/
1109
1110PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1111    const Py_UNICODE *data, 	/* Unicode char buffer */
1112    Py_ssize_t length,	 	/* Number of Py_UNICODE chars to translate */
1113    PyObject *table,		/* Translate table */
1114    const char *errors		/* error handling */
1115    );
1116
1117#ifdef MS_WIN32
1118
1119/* --- MBCS codecs for Windows -------------------------------------------- */
1120
1121PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1122    const char *string,         /* MBCS encoded string */
1123    Py_ssize_t length,              /* size of string */
1124    const char *errors          /* error handling */
1125    );
1126
1127PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1128    const char *string,         /* MBCS encoded string */
1129    Py_ssize_t length,          /* size of string */
1130    const char *errors,         /* error handling */
1131    Py_ssize_t *consumed        /* bytes consumed */
1132    );
1133
1134PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1135    PyObject *unicode           /* Unicode object */
1136    );
1137
1138PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1139    const Py_UNICODE *data,     /* Unicode char buffer */
1140    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1141    const char *errors          /* error handling */
1142    );
1143
1144#endif /* MS_WIN32 */
1145
1146/* --- Decimal Encoder ---------------------------------------------------- */
1147
1148/* Takes a Unicode string holding a decimal value and writes it into
1149   an output buffer using standard ASCII digit codes.
1150
1151   The output buffer has to provide at least length+1 bytes of storage
1152   area. The output string is 0-terminated.
1153
1154   The encoder converts whitespace to ' ', decimal characters to their
1155   corresponding ASCII digit and all other Latin-1 characters except
1156   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1157   are treated as errors. This includes embedded NUL characters.
1158
1159   Error handling is defined by the errors argument:
1160
1161      NULL or "strict": raise a ValueError
1162      "ignore": ignore the wrong characters (these are not copied to the
1163		output buffer)
1164      "replace": replaces illegal characters with '?'
1165
1166   Returns 0 on success, -1 on failure.
1167
1168*/
1169
1170PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1171    Py_UNICODE *s,		/* Unicode buffer */
1172    Py_ssize_t length,		/* Number of Py_UNICODE chars to encode */
1173    char *output,		/* Output buffer; must have size >= length+1 */
1174    const char *errors		/* error handling */
1175    );
1176
1177/* --- Methods & Slots ----------------------------------------------------
1178
1179   These are capable of handling Unicode objects and strings on input
1180   (we refer to them as strings in the descriptions) and return
1181   Unicode objects or integers as appropriate. */
1182
1183/* Concat two strings giving a new Unicode string. */
1184
1185PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1186    PyObject *left,	 	/* Left string */
1187    PyObject *right	 	/* Right string */
1188    );
1189
1190/* Concat two strings and put the result in *pleft
1191   (sets *pleft to NULL on error) */
1192
1193PyAPI_FUNC(void) PyUnicode_Append(
1194    PyObject **pleft,	 	/* Pointer to left string */
1195    PyObject *right	 	/* Right string */
1196    );
1197
1198/* Concat two strings, put the result in *pleft and drop the right object
1199   (sets *pleft to NULL on error) */
1200
1201PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1202    PyObject **pleft,	 	/* Pointer to left string */
1203    PyObject *right	 	/* Right string */
1204    );
1205
1206/* Split a string giving a list of Unicode strings.
1207
1208   If sep is NULL, splitting will be done at all whitespace
1209   substrings. Otherwise, splits occur at the given separator.
1210
1211   At most maxsplit splits will be done. If negative, no limit is set.
1212
1213   Separators are not included in the resulting list.
1214
1215*/
1216
1217PyAPI_FUNC(PyObject*) PyUnicode_Split(
1218    PyObject *s,		/* String to split */
1219    PyObject *sep,		/* String separator */
1220    Py_ssize_t maxsplit		/* Maxsplit count */
1221    );
1222
1223/* Ditto, but split at line breaks.
1224
1225   CRLF is considered to be one line break. Line breaks are not
1226   included in the resulting list unless keepends is true. */
1227
1228PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1229    PyObject *s,		/* String to split */
1230    int keepends		/* If true, line end markers are included */
1231    );
1232
1233/* Partition a string using a given separator. */
1234
1235PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1236    PyObject *s,		/* String to partition */
1237    PyObject *sep		/* String separator */
1238    );
1239
1240/* Partition a string using a given separator, searching from the end of the
1241   string. */
1242
1243PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1244    PyObject *s,		/* String to partition */
1245    PyObject *sep		/* String separator */
1246    );
1247
1248/* Split a string giving a list of Unicode strings.
1249
1250   If sep is NULL, splitting will be done at all whitespace
1251   substrings. Otherwise, splits occur at the given separator.
1252
1253   At most maxsplit splits will be done. But unlike PyUnicode_Split
1254   PyUnicode_RSplit splits from the end of the string. If negative,
1255   no limit is set.
1256
1257   Separators are not included in the resulting list.
1258
1259*/
1260
1261PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1262    PyObject *s,		/* String to split */
1263    PyObject *sep,		/* String separator */
1264    Py_ssize_t maxsplit		/* Maxsplit count */
1265    );
1266
1267/* Translate a string by applying a character mapping table to it and
1268   return the resulting Unicode object.
1269
1270   The mapping table must map Unicode ordinal integers to Unicode
1271   ordinal integers or None (causing deletion of the character).
1272
1273   Mapping tables may be dictionaries or sequences. Unmapped character
1274   ordinals (ones which cause a LookupError) are left untouched and
1275   are copied as-is.
1276
1277*/
1278
1279PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1280    PyObject *str,		/* String */
1281    PyObject *table,		/* Translate table */
1282    const char *errors		/* error handling */
1283    );
1284
1285/* Join a sequence of strings using the given separator and return
1286   the resulting Unicode string. */
1287
1288PyAPI_FUNC(PyObject*) PyUnicode_Join(
1289    PyObject *separator, 	/* Separator string */
1290    PyObject *seq	 	/* Sequence object */
1291    );
1292
1293/* Return 1 if substr matches str[start:end] at the given tail end, 0
1294   otherwise. */
1295
1296PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1297    PyObject *str,		/* String */
1298    PyObject *substr,		/* Prefix or Suffix string */
1299    Py_ssize_t start,		/* Start index */
1300    Py_ssize_t end,		/* Stop index */
1301    int direction		/* Tail end: -1 prefix, +1 suffix */
1302    );
1303
1304/* Return the first position of substr in str[start:end] using the
1305   given search direction or -1 if not found. -2 is returned in case
1306   an error occurred and an exception is set. */
1307
1308PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1309    PyObject *str,		/* String */
1310    PyObject *substr,		/* Substring to find */
1311    Py_ssize_t start,		/* Start index */
1312    Py_ssize_t end,		/* Stop index */
1313    int direction		/* Find direction: +1 forward, -1 backward */
1314    );
1315
1316/* Count the number of occurrences of substr in str[start:end]. */
1317
1318PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1319    PyObject *str,		/* String */
1320    PyObject *substr,		/* Substring to count */
1321    Py_ssize_t start,		/* Start index */
1322    Py_ssize_t end		/* Stop index */
1323    );
1324
1325/* Replace at most maxcount occurrences of substr in str with replstr
1326   and return the resulting Unicode object. */
1327
1328PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1329    PyObject *str,		/* String */
1330    PyObject *substr,		/* Substring to find */
1331    PyObject *replstr,		/* Substring to replace */
1332    Py_ssize_t maxcount		/* Max. number of replacements to apply;
1333				   -1 = all */
1334    );
1335
1336/* Compare two strings and return -1, 0, 1 for less than, equal,
1337   greater than resp. */
1338
1339PyAPI_FUNC(int) PyUnicode_Compare(
1340    PyObject *left,		/* Left string */
1341    PyObject *right		/* Right string */
1342    );
1343
1344PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1345    PyObject *left,		/* Left string */
1346    const char *right		/* Right string as a C string; presumably ASCII-only, going by the function name -- confirm */
1347    );
1348
1349/* Rich compare two strings and return one of the following:
1350
1351   - NULL in case an exception was raised
1352   - Py_True or Py_False for successful comparisons
1353   - Py_NotImplemented in case the type combination is unknown
1354
1355   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1356   case the conversion of the arguments to Unicode fails with a
1357   UnicodeDecodeError.
1358
1359   Possible values for op:
1360
1361     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1362
1363*/
1364
1365PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1366    PyObject *left,		/* Left string */
1367    PyObject *right,		/* Right string */
1368    int op			/* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1369    );
1370
1371/* Apply an argument tuple or dictionary to a format string and return
1372   the resulting Unicode string. */
1373
1374PyAPI_FUNC(PyObject *) PyUnicode_Format(
1375    PyObject *format,		/* Format string */
1376    PyObject *args		/* Argument tuple or dictionary */
1377    );
1378
1379/* Checks whether element is contained in container and returns 1/0
1380   accordingly.
1381
1382   element has to coerce to a one-element Unicode string. -1 is
1383   returned in case of an error. */
1384
1385PyAPI_FUNC(int) PyUnicode_Contains(
1386    PyObject *container,	/* Container string */
1387    PyObject *element		/* Element string */
1388    );
1389
1390/* Checks whether argument is a valid identifier. */
1391
1392PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1393
1394/* Externally visible for str.strip(unicode) */
1395PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1396    PyUnicodeObject *self,
1397    int striptype,
1398    PyObject *sepobj
1399    );
1400
1401/* === Characters Type APIs =============================================== */
1402
1403/* These should not be used directly. Use the Py_UNICODE_IS* and
1404   Py_UNICODE_TO* macros instead.
1405
1406   These APIs are implemented in Objects/unicodectype.c.
1407
1408*/
1409
1410PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1411    Py_UNICODE ch 	/* Unicode character */
1412    );
1413
1414PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1415    Py_UNICODE ch 	/* Unicode character */
1416    );
1417
1418PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1419    Py_UNICODE ch 	/* Unicode character */
1420    );
1421
1422PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1423    Py_UNICODE ch 	/* Unicode character */
1424    );
1425
1426PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1427    Py_UNICODE ch 	/* Unicode character */
1428    );
1429
1430PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1431    const Py_UNICODE ch 	/* Unicode character */
1432    );
1433
1434PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1435    const Py_UNICODE ch 	/* Unicode character */
1436    );
1437
1438PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
1439    Py_UNICODE ch 	/* Unicode character */
1440    );
1441
1442PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
1443    Py_UNICODE ch 	/* Unicode character */
1444    );
1445
1446PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
1447    Py_UNICODE ch 	/* Unicode character */
1448    );
1449
1450PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1451    Py_UNICODE ch 	/* Unicode character */
1452    );
1453
1454PyAPI_FUNC(int) _PyUnicode_ToDigit(
1455    Py_UNICODE ch 	/* Unicode character */
1456    );
1457
1458PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1459    Py_UNICODE ch 	/* Unicode character */
1460    );
1461
1462PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1463    Py_UNICODE ch 	/* Unicode character */
1464    );
1465
1466PyAPI_FUNC(int) _PyUnicode_IsDigit(
1467    Py_UNICODE ch 	/* Unicode character */
1468    );
1469
1470PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1471    Py_UNICODE ch 	/* Unicode character */
1472    );
1473
1474PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1475    Py_UNICODE ch 	/* Unicode character */
1476    );
1477
1478PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u);
1479
1480PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1481    Py_UNICODE *s1, const Py_UNICODE *s2);
1482
1483PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1484    Py_UNICODE *s1, const Py_UNICODE *s2, size_t n);
1485
1486PyAPI_FUNC(int) Py_UNICODE_strcmp(
1487    const Py_UNICODE *s1, const Py_UNICODE *s2);
1488
1489PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1490    const Py_UNICODE *s, Py_UNICODE c
1491    );
1492
1493#ifdef __cplusplus
1494}
1495#endif
1496#endif /* !Py_UNICODEOBJECT_H */
1497