unicodeobject.h revision ece58deb9fd72674b84ef7a01c944b5eed6b37a1
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
12
13Copyright (c) Corporation for National Research Initiatives.
14
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python.  This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
32 *
33 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
35 *
36 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
39 *
40 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
48 *
49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
58#include <ctype.h>
59
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
64/* Python 3.x requires unicode */
65#define Py_USING_UNICODE
66
67#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
69#endif
70
71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74   Otherwise, Unicode strings are stored as UCS-2 (with limited support
75   for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
86/* Py_UNICODE was the native Unicode storage format (code unit) used by
87   Python and represents a single Unicode element in the Unicode type.
88   With PEP 393, Py_UNICODE is deprecated and replaced with a
89   typedef to wchar_t. */
90
91#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
94#endif
95
96/* If the compiler provides a wchar_t type we try to support it
97   through the interface functions PyUnicode_FromWideChar(),
98   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
99
100#ifdef HAVE_USABLE_WCHAR_T
101# ifndef HAVE_WCHAR_H
102#  define HAVE_WCHAR_H
103# endif
104#endif
105
106#if defined(MS_WINDOWS)
107#  define HAVE_MBCS
108#endif
109
110#ifdef HAVE_WCHAR_H
111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113#  include <time.h>
114# endif
115#  include <wchar.h>
116#endif
117
118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
119   unicode representations. */
120#if SIZEOF_INT == 4
121typedef unsigned int Py_UCS4;
122#elif SIZEOF_LONG == 4
123typedef unsigned long Py_UCS4;
124#else
125#error "Could not find a proper typedef for Py_UCS4"
126#endif
127
128#if SIZEOF_SHORT == 2
129typedef unsigned short Py_UCS2;
130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
134typedef unsigned char Py_UCS1;
135
136/* --- Internal Unicode Operations ---------------------------------------- */
137
138/* Since splitting on whitespace is an important use case, and
139   whitespace in most situations is solely ASCII whitespace, we
140   optimize for the common case by using a quick look-up table
141   _Py_ascii_whitespace (see below) with an inlined check.
142
143 */
144#ifndef Py_LIMITED_API
145#define Py_UNICODE_ISSPACE(ch) \
146    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
147
148#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
149#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
150#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
151#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
152
153#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
154#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
155#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
156
157#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
158#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
159#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
160#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
161
162#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
163#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
164#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
165
166#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
167
168#define Py_UNICODE_ISALNUM(ch) \
169       (Py_UNICODE_ISALPHA(ch) || \
170    Py_UNICODE_ISDECIMAL(ch) || \
171    Py_UNICODE_ISDIGIT(ch) || \
172    Py_UNICODE_ISNUMERIC(ch))
173
/* Copy `length` Py_UNICODE code units from `source` to `target` with
   Py_MEMCPY; the byte count passed is length * sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store `value` into the first `length` Py_UNICODE slots of `target`.
   All three arguments are captured in locals before the loop starts, so
   each one is evaluated exactly once (expressions with side effects are
   safe, and the bound is not recomputed on every iteration). */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        const Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
181
/* Macros to work with UTF-16 surrogates.  The argument is parenthesized
   at every use so that compound expressions (e.g. `x & 0xFFFF`) can be
   passed; note that the range tests evaluate `ch` twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 | (((ch) - 0x10000) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 | (((ch) - 0x10000) & 0x3FF))
194
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   Both objects must already have a wstr representation (wstr != NULL);
   this macro does not create one.  The wstr pointer is read through a
   PyASCIIObject cast and the length through PyUnicode_WSTR_LENGTH(),
   because wstr is a member of PyASCIIObject while wstr_length is a member
   of PyCompactUnicodeObject: no single structure in this header exposes
   both as direct members.  `substring` is handed to
   PyUnicode_WSTR_LENGTH() already parenthesized so that the casts in that
   macro bind to the whole argument expression.

   Every argument is evaluated more than once. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH((substring)) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH((substring)) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH((substring)) * sizeof(Py_UNICODE)))
202
203#endif /* Py_LIMITED_API */
204
205#ifdef __cplusplus
206extern "C" {
207#endif
208
209/* --- Unicode Type ------------------------------------------------------- */
210
211#ifndef Py_LIMITED_API
212
213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
214   structure. state.ascii and state.compact are set, and the data
215   immediately follow the structure. utf8_length and wstr_length can be found
216   in the length field; the utf8 pointer is equal to the data pointer. */
217typedef struct {
218    /* There are 4 forms of Unicode strings:
219
220       - compact ascii:
221
222         * structure = PyASCIIObject
223         * test: PyUnicode_IS_COMPACT_ASCII(op)
224         * kind = PyUnicode_1BYTE_KIND
225         * compact = 1
226         * ascii = 1
227         * ready = 1
228         * (length is the length of the utf8 and wstr strings)
229         * (data starts just after the structure)
230         * (since ASCII is decoded from UTF-8, the utf8 string is the data)
231
232       - compact:
233
234         * structure = PyCompactUnicodeObject
235         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
236         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
237           PyUnicode_4BYTE_KIND
238         * compact = 1
239         * ready = 1
240         * ascii = 0
241         * utf8 is not shared with data
242         * utf8_length = 0 if utf8 is NULL
243         * wstr is shared with data and wstr_length=length
244           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
245           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
246         * wstr_length = 0 if wstr is NULL
247         * (data starts just after the structure)
248
249       - legacy string, not ready:
250
251         * structure = PyUnicodeObject
252         * test: kind == PyUnicode_WCHAR_KIND
253         * length = 0 (use wstr_length)
254         * hash = -1
255         * kind = PyUnicode_WCHAR_KIND
256         * compact = 0
257         * ascii = 0
258         * ready = 0
259         * interned = SSTATE_NOT_INTERNED
260         * wstr is not NULL
261         * data.any is NULL
262         * utf8 is NULL
263         * utf8_length = 0
264
265       - legacy string, ready:
266
267         * structure = PyUnicodeObject structure
268         * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
269         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
270           PyUnicode_4BYTE_KIND
271         * compact = 0
272         * ready = 1
273         * data.any is not NULL
274         * utf8 is shared and utf8_length = length with data.any if ascii = 1
275         * utf8_length = 0 if utf8 is NULL
276         * wstr is shared with data.any and wstr_length = length
277           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
278           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
279         * wstr_length = 0 if wstr is NULL
280
281       Compact strings use only one memory block (structure + characters),
282       whereas legacy strings use one block for the structure and one block
283       for characters.
284
285       Legacy strings are created by PyUnicode_FromUnicode() and
286       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
287       when PyUnicode_READY() is called.
288
289       See also _PyUnicode_CheckConsistency().
290    */
291    PyObject_HEAD
292    Py_ssize_t length;          /* Number of code points in the string */
293    Py_hash_t hash;             /* Hash value; -1 if not set */
294    struct {
295        /*
296           SSTATE_NOT_INTERNED (0)
297           SSTATE_INTERNED_MORTAL (1)
298           SSTATE_INTERNED_IMMORTAL (2)
299
300           If interned != SSTATE_NOT_INTERNED, the two references from the
301           dictionary to this object are *not* counted in ob_refcnt.
302         */
303        unsigned int interned:2;
304        /* Character size:
305
306           - PyUnicode_WCHAR_KIND (0):
307
308             * character type = wchar_t (16 or 32 bits, depending on the
309               platform)
310
311           - PyUnicode_1BYTE_KIND (1):
312
313             * character type = Py_UCS1 (8 bits, unsigned)
314             * all characters are in the range U+0000-U+00FF (latin1)
315             * if ascii is set, all characters are in the range U+0000-U+007F
316               (ASCII), otherwise at least one character is in the range
317               U+0080-U+00FF
318
319           - PyUnicode_2BYTE_KIND (2):
320
321             * character type = Py_UCS2 (16 bits, unsigned)
322             * all characters are in the range U+0000-U+FFFF (BMP)
323             * at least one character is in the range U+0100-U+FFFF
324
325           - PyUnicode_4BYTE_KIND (4):
326
327             * character type = Py_UCS4 (32 bits, unsigned)
328             * all characters are in the range U+0000-U+10FFFF
329             * at least one character is in the range U+10000-U+10FFFF
330         */
331        unsigned int kind:3;
332        /* Compact is with respect to the allocation scheme. Compact unicode
333           objects only require one memory block while non-compact objects use
334           one block for the PyUnicodeObject struct and another for its data
335           buffer. */
336        unsigned int compact:1;
337        /* The string only contains characters in the range U+0000-U+007F (ASCII)
338           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
339           set, use the PyASCIIObject structure. */
340        unsigned int ascii:1;
341        /* The ready flag indicates whether the object layout is initialized
342           completely. This means that this is either a compact object, or
343           the data pointer is filled out. The bit is redundant, and helps
344           to minimize the test in PyUnicode_IS_READY(). */
345        unsigned int ready:1;
346    } state;
347    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
348} PyASCIIObject;
349
350/* Non-ASCII strings allocated through PyUnicode_New use the
351   PyCompactUnicodeObject structure. state.compact is set, and the data
352   immediately follow the structure. */
353typedef struct {
354    PyASCIIObject _base;        /* Leading fields: length, hash, state, wstr */
355    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
356                                 * terminating \0. */
357    char *utf8;                 /* UTF-8 representation (null-terminated) */
358    Py_ssize_t wstr_length;     /* Number of wchar_t code units in wstr;
359                                 * a surrogate pair counts as two units. */
360} PyCompactUnicodeObject;
361
362/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
363   PyUnicodeObject structure. The actual string data is initially in the wstr
364   block, and copied into the data block using _PyUnicode_Ready. */
365typedef struct {
366    PyCompactUnicodeObject _base;
367    union {
368        void *any;
369        Py_UCS1 *latin1;
370        Py_UCS2 *ucs2;
371        Py_UCS4 *ucs4;
372    } data;                     /* Canonical, smallest-form Unicode buffer */
373} PyUnicodeObject;
374#endif
375
376PyAPI_DATA(PyTypeObject) PyUnicode_Type;
377PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
378
379#define PyUnicode_Check(op) \
380                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
381#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
382
383/* Fast access macros */
384#ifndef Py_LIMITED_API
385
/* Return the number of wchar_t code units in the wstr representation.
   Compact ASCII objects use the PyASCIIObject layout, which has no
   wstr_length member; for them the `length` field is read instead.
   No type checks are performed and `op` is evaluated more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
390
391/* Returns the deprecated Py_UNICODE representation's size in code units
392   (this includes surrogate pairs as 2 units).
393   If the Py_UNICODE representation is not available, it will be computed
394   on request.  Use PyUnicode_GET_LENGTH() for the length in code points. */
395
396#define PyUnicode_GET_SIZE(op)                       \
397    (assert(PyUnicode_Check(op)),                    \
398     (((PyASCIIObject *)(op))->wstr) ?               \
399      PyUnicode_WSTR_LENGTH(op) :                    \
400      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
401       assert(((PyASCIIObject *)(op))->wstr),        \
402       PyUnicode_WSTR_LENGTH(op)))
403
404#define PyUnicode_GET_DATA_SIZE(op) \
405    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
406
407/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
408   representation on demand.  Using this macro is very inefficient now,
409   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
410   use PyUnicode_WRITE() and PyUnicode_READ(). */
411
412#define PyUnicode_AS_UNICODE(op) \
413    (assert(PyUnicode_Check(op)), \
414     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
415      PyUnicode_AsUnicode((PyObject *)(op)))
416
417#define PyUnicode_AS_DATA(op) \
418    ((const char *)(PyUnicode_AS_UNICODE(op)))
419
420
421/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
422
423/* Values for PyASCIIObject.state: */
424
425/* Interning state. */
426#define SSTATE_NOT_INTERNED 0
427#define SSTATE_INTERNED_MORTAL 1
428#define SSTATE_INTERNED_IMMORTAL 2
429
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.  The result is the state.ascii bit of the object; `op` is
   evaluated more than once when assertions are enabled. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
437
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   Both state bits are read through a cast of the fully parenthesized
   argument; `op` is evaluated twice when the ascii bit is set. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
447
448enum PyUnicode_Kind {
449/* String contains only wstr byte characters.  This is only possible
450   when the string was created with a legacy API and _PyUnicode_Ready()
451   has not been called yet.  */
452    PyUnicode_WCHAR_KIND = 0,
453/* Return values of the PyUnicode_KIND() macro: */
454    PyUnicode_1BYTE_KIND = 1,
455    PyUnicode_2BYTE_KIND = 2,
456    PyUnicode_4BYTE_KIND = 4
457};
458
459/* Return pointers to the canonical representation cast to unsigned char,
460   Py_UCS2, or Py_UCS4 for direct character access.
461   No checks are performed, use PyUnicode_KIND() before to ensure
462   these will work correctly. */
463
464#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
465#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
466#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
467
468/* Return one of the PyUnicode_*_KIND values defined above. */
469#define PyUnicode_KIND(op) \
470    (assert(PyUnicode_Check(op)), \
471     assert(PyUnicode_IS_READY(op)),            \
472     ((PyASCIIObject *)(op))->state.kind)
473
474/* Return a void pointer to the raw unicode buffer. */
475#define _PyUnicode_COMPACT_DATA(op)                     \
476    (PyUnicode_IS_ASCII(op) ?                   \
477     ((void*)((PyASCIIObject*)(op) + 1)) :              \
478     ((void*)((PyCompactUnicodeObject*)(op) + 1)))
479
480#define _PyUnicode_NONCOMPACT_DATA(op)                  \
481    (assert(((PyUnicodeObject*)(op))->data.any),        \
482     ((((PyUnicodeObject *)(op))->data.any)))
483
484#define PyUnicode_DATA(op) \
485    (assert(PyUnicode_Check(op)), \
486     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
487     _PyUnicode_NONCOMPACT_DATA(op))
488
489/* In the access macros below, "kind" may be evaluated more than once.
490   All other macro parameters are evaluated exactly once, so it is safe
491   to put side effects into them (such as increasing the index). */
492
/* Store the code point `value` at position `index` (zero-based) of a
   canonical character buffer.  Nothing is validated here: the macro is
   meant for tight loops in which the caller has already fetched and cached
   the kind and the data pointer through the other macros.  The value is
   narrowed to the character type selected by `kind`; any kind other than
   the 1-byte or 2-byte one is treated as the 4-byte kind (asserted). */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        if ((kind) == PyUnicode_1BYTE_KIND) \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
        else if ((kind) == PyUnicode_2BYTE_KIND) \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
        else { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
    } while (0)
515
/* Fetch the code point stored at `index` in a canonical character buffer
   of the given kind, widened to Py_UCS4.  Nothing is validated and no
   ready call is made; any kind other than the 1-byte or 2-byte one is
   read as 4-byte data. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
     (((kind) == PyUnicode_1BYTE_KIND) \
          ? ((const Py_UCS1 *)(data))[(index)] \
          : (((kind) != PyUnicode_2BYTE_KIND) \
                 ? ((const Py_UCS4 *)(data))[(index)] \
                 : ((const Py_UCS2 *)(data))[(index)])))
527
528/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
529   calls PyUnicode_KIND() and might call it twice.  For single reads, use
530   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
531   cache kind and use PyUnicode_READ instead. */
532#define PyUnicode_READ_CHAR(unicode, index) \
533    (assert(PyUnicode_Check(unicode)),          \
534     assert(PyUnicode_IS_READY(unicode)),       \
535     (Py_UCS4)                                  \
536        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
537            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
538            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
539                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
540                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
541            ) \
542        ))
543
544/* Returns the length of the unicode string in code points. The caller has
545   to make sure that the string has its canonical representation set before
546   calling this macro.  Call PyUnicode_READY() to ensure that. */
547#define PyUnicode_GET_LENGTH(op)                \
548    (assert(PyUnicode_Check(op)),               \
549     assert(PyUnicode_IS_READY(op)),            \
550     ((PyASCIIObject *)(op))->length)
551
552
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   Reads the state.ready bit only; no type check is performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
557
558/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
559   case.  If the canonical representation is not yet set, it will still call
560   _PyUnicode_Ready().
561   Returns 0 on success and -1 on errors. */
562#define PyUnicode_READY(op)                        \
563    (assert(PyUnicode_Check(op)),                       \
564     (PyUnicode_IS_READY(op) ?                          \
565      0 : _PyUnicode_Ready((PyObject *)(op))))
566
567/* Return a maximum character value which is suitable for creating another
568   string based on op.  This is always an approximation but more efficient
569   than iterating over the string. */
570#define PyUnicode_MAX_CHAR_VALUE(op) \
571    (assert(PyUnicode_IS_READY(op)),                                    \
572     (PyUnicode_IS_ASCII(op) ?                                          \
573      (0x7f) :                                                          \
574      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
575       (0xffU) :                                                        \
576       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
577        (0xffffU) :                                                     \
578        (0x10ffffU)))))
579
580#endif
581
582/* --- Constants ---------------------------------------------------------- */
583
584/* This Unicode character will be used as replacement character during
585   decoding if the errors argument is set to "replace". Note: the
586   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
587   Unicode 3.0. */
588
589#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
590
591/* === Public API ========================================================= */
592
593/* --- Plain Py_UNICODE --------------------------------------------------- */
594
595/* With PEP 393, this is the recommended way to allocate a new unicode object.
596   This function will allocate the object and its buffer in a single memory
597   block.  Objects created using this function are not resizable. */
598#ifndef Py_LIMITED_API
599PyAPI_FUNC(PyObject*) PyUnicode_New(
600    Py_ssize_t size,            /* Number of code points in the new string */
601    Py_UCS4 maxchar             /* maximum code point value in the string */
602    );
603#endif
604
605/* Initializes the canonical string representation from the deprecated
606   wstr/Py_UNICODE representation. This function is used to convert Unicode
607   objects which were created using the old API to the new flexible format
608   introduced with PEP 393.
609
610   Don't call this function directly, use the public PyUnicode_READY() macro
611   instead. */
612#ifndef Py_LIMITED_API
613PyAPI_FUNC(int) _PyUnicode_Ready(
614    PyObject *unicode           /* Unicode object */
615    );
616#endif
617
618/* Get a copy of a Unicode string. */
619#ifndef Py_LIMITED_API
620PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
621    PyObject *unicode
622    );
623#endif
624
625/* Copy characters from one unicode object into another; this function performs
626   character conversion when necessary and falls back to memcpy() if possible.
627
628   Fail if to is too small (smaller than *how_many* or smaller than
629   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
630   kind(to), or if *to* has more than 1 reference.
631
632   Return the number of written characters, or return -1 and raise an exception
633   on error.
634
635   Pseudo-code:
636
637       how_many = min(how_many, len(from) - from_start)
638       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
639       return how_many
640
641   Note: The function doesn't write a terminating null character.
642   */
643#ifndef Py_LIMITED_API
644PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
645    PyObject *to,               /* destination string */
646    Py_ssize_t to_start,        /* first index written in to */
647    PyObject *from,             /* source string */
648    Py_ssize_t from_start,      /* first index read in from */
649    Py_ssize_t how_many         /* maximum number of characters to copy */
650    );
651#endif
652
653/* Fill a string with a character: write fill_char into
654   unicode[start:start+length].
655
656   Fail if fill_char is bigger than the string maximum character, or if the
657   string has more than 1 reference.
658
659   Return the number of written characters, or return -1 and raise an exception
660   on error. */
661#ifndef Py_LIMITED_API
662PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
663    PyObject *unicode,          /* string to write into */
664    Py_ssize_t start,           /* first index to fill */
665    Py_ssize_t length,          /* number of characters to fill */
666    Py_UCS4 fill_char           /* character to write */
667    );
668#endif
669
670/* Create a Unicode Object from the Py_UNICODE buffer u of the given
671   size.
672
673   u may be NULL which causes the contents to be undefined. It is the
674   user's responsibility to fill in the needed data afterwards. Note
675   that modifying the Unicode object contents after construction is
676   only allowed if u was set to NULL.
677
678   The buffer is copied into the new object. */
679
680#ifndef Py_LIMITED_API
681PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
682    const Py_UNICODE *u,        /* Unicode buffer */
683    Py_ssize_t size             /* size of buffer */
684    );
685#endif
686
687/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
688PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
689    const char *u,             /* UTF-8 encoded string */
690    Py_ssize_t size            /* size of buffer */
691    );
692
693/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
694   UTF-8 encoded bytes.  The size is determined with strlen(). */
695PyAPI_FUNC(PyObject*) PyUnicode_FromString(
696    const char *u              /* UTF-8 encoded string */
697    );
698
699/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
700   Scan the string to find the maximum character. */
701#ifndef Py_LIMITED_API
702PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
703    int kind,
704    const void *buffer,
705    Py_ssize_t size);
706#endif
707
708PyAPI_FUNC(PyObject*) PyUnicode_Substring(
709    PyObject *str,
710    Py_ssize_t start,
711    Py_ssize_t end);
712
713#ifndef Py_LIMITED_API
714/* Compute the maximum character of the substring unicode[start:end].
715   Return 127 for an empty string. */
716PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
717    PyObject *unicode,
718    Py_ssize_t start,
719    Py_ssize_t end);
720#endif
721
722/* Copy the string into a UCS4 buffer including the null character if copy_null
723   is set. Return NULL and raise an exception on error. Raise a ValueError if
724   the buffer is smaller than the string. Return buffer on success.
725
726   buflen is the length of the buffer in (Py_UCS4) characters. */
727PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
728    PyObject *unicode,
729    Py_UCS4* buffer,
730    Py_ssize_t buflen,
731    int copy_null);
732
733/* Copy the string into a UCS4 buffer. A new buffer is allocated using
734   PyMem_Malloc; if this fails, NULL is returned with a memory error
735   exception set. */
736PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
737
738/* Return a read-only pointer to the Unicode object's internal
739   Py_UNICODE buffer.
740   If the wchar_t/Py_UNICODE representation is not yet available, this
741   function will calculate it. */
742
743#ifndef Py_LIMITED_API
744PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
745    PyObject *unicode           /* Unicode object */
746    );
747#endif
748
749/* Return a read-only pointer to the Unicode object's internal
750   Py_UNICODE buffer and save the length at size.
751   If the wchar_t/Py_UNICODE representation is not yet available, this
752   function will calculate it. */
753
754#ifndef Py_LIMITED_API
755PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
756    PyObject *unicode,          /* Unicode object */
757    Py_ssize_t *size            /* location where to save the length */
758    );
759#endif
760
761/* Get the length of the Unicode object. */
762
763PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
764    PyObject *unicode
765);
766
767/* Get the number of Py_UNICODE units in the
768   string representation. */
769
770PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
771    PyObject *unicode           /* Unicode object */
772    );
773
774/* Read a character from the string. */
775
776PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
777    PyObject *unicode,
778    Py_ssize_t index
779    );
780
781/* Write a character to the string. The string must have been created through
782   PyUnicode_New, must not be shared, and must not have been hashed yet.
783
784   Return 0 on success, -1 on error. */
785
786PyAPI_FUNC(int) PyUnicode_WriteChar(
787    PyObject *unicode,
788    Py_ssize_t index,
789    Py_UCS4 character
790    );
791
792#ifndef Py_LIMITED_API
793/* Get the maximum ordinal for a Unicode character. */
794PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
795#endif
796
797/* Resize an Unicode object. The length is the number of characters, except
798   if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
799   is the number of Py_UNICODE characters.
800
801   *unicode is modified to point to the new (resized) object and 0
802   returned on success.
803
804   Try to resize the string in place (which is usually faster than allocating
805   a new string and copying characters), or create a new string.
806
807   Error handling is implemented as follows: an exception is set, -1
808   is returned and *unicode left untouched.
809
810   WARNING: The function doesn't check string content, the result may not be a
811            string in canonical representation. */
812
813PyAPI_FUNC(int) PyUnicode_Resize(
814    PyObject **unicode,         /* Pointer to the Unicode object */
815    Py_ssize_t length           /* New length */
816    );
817
818/* Coerce obj to a Unicode object and return a reference with
819   *incremented* refcount.
820
821   Coercion is done in the following way:
822
823   1. bytes, bytearray and other char buffer compatible objects are
824      decoded using encoding and errors; NULL selects the defaults
825      described under "Builtin Codecs" (UTF-8, "strict" mode).
826
827   2. All other objects (including Unicode objects) raise an
828      exception.
829
830   The API returns NULL in case of an error. The caller is responsible
831   for decref'ing the returned object.
832
833*/
834
835PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
836    PyObject *obj,              /* Object to decode */
837    const char *encoding,       /* encoding */
838    const char *errors          /* error handling */
839    );
840
841/* Coerce obj to a Unicode object and return a reference with
842   *incremented* refcount.
843
844   Unicode objects are passed back as-is (subclasses are converted to
845   true Unicode objects), all other objects are delegated to
846   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
847   using UTF-8 encoding as basis for decoding the object.
848
849   The API returns NULL in case of an error. The caller is responsible
850   for decref'ing the returned object.
851
852*/
853
854PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
855    PyObject *obj               /* Object */
856    );
857
858PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
859    const char *format,   /* ASCII-encoded string  */
860    va_list vargs
861    );
862PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
863    const char *format,   /* ASCII-encoded string  */
864    ...
865    );
866
867#ifndef Py_LIMITED_API
868/* Format the object based on the format_spec, as defined in PEP 3101
869   (Advanced String Formatting). */
870PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
871                                                 PyObject *format_spec,
872                                                 Py_ssize_t start,
873                                                 Py_ssize_t end);
874#endif
875
876PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
877PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
878PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
879    const char *u              /* UTF-8 encoded string */
880    );
881#ifndef Py_LIMITED_API
882PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
883#endif
884
885/* Expands to the state.interned field of op, accessed through a cast to
   PyASCIIObject *.  The macro performs no type check on op, so use it only
   if you know it's a string. */
886#define PyUnicode_CHECK_INTERNED(op) \
887    (((PyASCIIObject *)(op))->state.interned)
888
889/* --- wchar_t support for platforms which support it --------------------- */
890
891#ifdef HAVE_WCHAR_H
892
893/* Create a Unicode Object from the wchar_t buffer w of the given
894   size.
895
896   The buffer is copied into the new object. */
897
898PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
899    const wchar_t *w,           /* wchar_t buffer */
900    Py_ssize_t size             /* size of buffer */
901    );
902
903/* Copies the Unicode Object contents into the wchar_t buffer w.  At
904   most size wchar_t characters are copied.
905
906   Note that the resulting wchar_t string may or may not be
907   0-terminated.  It is the responsibility of the caller to make sure
908   that the wchar_t string is 0-terminated in case this is required by
909   the application.
910
911   Returns the number of wchar_t characters copied (excluding a
912   possibly trailing 0-termination character) or -1 in case of an
913   error. */
914
915PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
916    PyObject *unicode,          /* Unicode object */
917    wchar_t *w,                 /* wchar_t buffer */
918    Py_ssize_t size             /* size of buffer */
919    );
920
921/* Convert the Unicode object to a wide character string. The output string
922   always ends with a null character. If size is not NULL, write the number of
923   wide characters (excluding the null character) into *size.
924
925   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
926   on success. On error, returns NULL, *size is undefined and raises a
927   MemoryError. */
928
929PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
930    PyObject *unicode,          /* Unicode object */
931    Py_ssize_t *size            /* number of characters of the result */
932    );
933
934#ifndef Py_LIMITED_API
935PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
936#endif
937
938#endif
939
940/* --- Unicode ordinals --------------------------------------------------- */
941
942/* Create a Unicode Object from the given Unicode code point ordinal.
943
944   The ordinal must be in range(0x10000) on narrow Python builds
945   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
946   raised in case it is not.
947
948*/
949
950PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
951
952/* --- Free-list management ----------------------------------------------- */
953
954/* Clear the free list used by the Unicode implementation.
955
956   This can be used to release memory used for objects on the free
957   list back to the Python memory allocator.
958
959*/
960
961PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
962
963/* === Builtin Codecs =====================================================
964
965   Many of these APIs take two arguments encoding and errors. These
966   parameters encoding and errors have the same semantics as the ones
967   of the builtin str() API.
968
969   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
970
971   Error handling is set by errors which may also be set to NULL
972   meaning to use the default handling defined for the codec. Default
973   error handling for all builtin codecs is "strict" (ValueErrors are
974   raised).
975
976   The codecs all use a similar interface. Only deviations from the
977   generic interface are documented.
978
979*/
980
981/* --- Manage the default encoding ---------------------------------------- */
982
983/* Returns a pointer to the default encoding (UTF-8) of the
984   Unicode object unicode and the size of the encoded representation
985   in bytes stored in *size.
986
987   In case of an error, *size is not set.
988
989   This function caches the UTF-8 encoded string in the unicodeobject
990   and subsequent calls will return the same string.  The memory is released
991   when the unicodeobject is deallocated.
992
993   _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
994   support the previous internal function with the same behaviour.
995
996   *** This API is for interpreter INTERNAL USE ONLY and will likely
997   *** be removed or changed in the future.
998
999   *** If you need to access the Unicode object as UTF-8 bytes string,
1000   *** please use PyUnicode_AsUTF8String() instead.
1001*/
1002
1003#ifndef Py_LIMITED_API
1004PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
1005    PyObject *unicode,
1006    Py_ssize_t *size);
1007#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
1008#endif
1009
1010/* Returns a pointer to the default encoding (UTF-8) of the
1011   Unicode object unicode.
1012
1013   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1014   in the unicodeobject.
1015
1016   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1017   support the previous internal function with the same behaviour.
1018
1019   Use of this API is DEPRECATED since no size information can be
1020   extracted from the returned data.
1021
1022   *** This API is for interpreter INTERNAL USE ONLY and will likely
1023   *** be removed or changed in the future.
1024
1025   *** If you need to access the Unicode object as UTF-8 bytes string,
1026   *** please use PyUnicode_AsUTF8String() instead.
1027
1028*/
1029
1030#ifndef Py_LIMITED_API
1031PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1032#define _PyUnicode_AsString PyUnicode_AsUTF8
1033#endif
1034
1035/* Returns "utf-8".  */
1036
1037PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
1038
1039/* --- Generic Codecs ----------------------------------------------------- */
1040
1041/* Create a Unicode object by decoding the encoded string s of the
1042   given size. */
1043
1044PyAPI_FUNC(PyObject*) PyUnicode_Decode(
1045    const char *s,              /* encoded string */
1046    Py_ssize_t size,            /* size of buffer */
1047    const char *encoding,       /* encoding */
1048    const char *errors          /* error handling */
1049    );
1050
1051/* Decode a Unicode object unicode and return the result as Python
1052   object. */
1053
1054PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
1055    PyObject *unicode,          /* Unicode object */
1056    const char *encoding,       /* encoding */
1057    const char *errors          /* error handling */
1058    );
1059
1060/* Decode a Unicode object unicode and return the result as Unicode
1061   object. */
1062
1063PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
1064    PyObject *unicode,          /* Unicode object */
1065    const char *encoding,       /* encoding */
1066    const char *errors          /* error handling */
1067    );
1068
1069/* Encodes a Py_UNICODE buffer of the given size and returns a
1070   Python string object. */
1071
1072#ifndef Py_LIMITED_API
1073PyAPI_FUNC(PyObject*) PyUnicode_Encode(
1074    const Py_UNICODE *s,        /* Unicode char buffer */
1075    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
1076    const char *encoding,       /* encoding */
1077    const char *errors          /* error handling */
1078    );
1079#endif
1080
1081/* Encodes a Unicode object and returns the result as Python
1082   object. */
1083
1084PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
1085    PyObject *unicode,          /* Unicode object */
1086    const char *encoding,       /* encoding */
1087    const char *errors          /* error handling */
1088    );
1089
1090/* Encodes a Unicode object and returns the result as Python string
1091   object. */
1092
1093PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
1094    PyObject *unicode,          /* Unicode object */
1095    const char *encoding,       /* encoding */
1096    const char *errors          /* error handling */
1097    );
1098
1099/* Encodes a Unicode object and returns the result as Unicode
1100   object. */
1101
1102PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
1103    PyObject *unicode,          /* Unicode object */
1104    const char *encoding,       /* encoding */
1105    const char *errors          /* error handling */
1106    );
1107
1108/* Build an encoding map. */
1109
1110PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1111    PyObject* string            /* 256 character map */
1112   );
1113
1114/* --- UTF-7 Codecs ------------------------------------------------------- */
1115
1116PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
1117    const char *string,         /* UTF-7 encoded string */
1118    Py_ssize_t length,          /* size of string */
1119    const char *errors          /* error handling */
1120    );
1121
1122PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
1123    const char *string,         /* UTF-7 encoded string */
1124    Py_ssize_t length,          /* size of string */
1125    const char *errors,         /* error handling */
1126    Py_ssize_t *consumed        /* bytes consumed */
1127    );
1128
1129#ifndef Py_LIMITED_API
1130PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
1131    const Py_UNICODE *data,     /* Unicode char buffer */
1132    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1133    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1134    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1135    const char *errors          /* error handling */
1136    );
1137PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1138    PyObject *unicode,          /* Unicode object */
1139    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1140    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1141    const char *errors          /* error handling */
1142    );
1143#endif
1144
1145/* --- UTF-8 Codecs ------------------------------------------------------- */
1146
1147PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
1148    const char *string,         /* UTF-8 encoded string */
1149    Py_ssize_t length,          /* size of string */
1150    const char *errors          /* error handling */
1151    );
1152
1153PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
1154    const char *string,         /* UTF-8 encoded string */
1155    Py_ssize_t length,          /* size of string */
1156    const char *errors,         /* error handling */
1157    Py_ssize_t *consumed        /* bytes consumed */
1158    );
1159
1160PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
1161    PyObject *unicode           /* Unicode object */
1162    );
1163
1164#ifndef Py_LIMITED_API
1165PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1166    PyObject *unicode,
1167    const char *errors);
1168
1169PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
1170    const Py_UNICODE *data,     /* Unicode char buffer */
1171    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1172    const char *errors          /* error handling */
1173    );
1174#endif
1175
1176/* --- UTF-32 Codecs ------------------------------------------------------ */
1177
1178/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1179   the corresponding Unicode object.
1180
1181   errors (if non-NULL) defines the error handling. It defaults
1182   to "strict".
1183
1184   If byteorder is non-NULL, the decoder starts decoding using the
1185   given byte order:
1186
1187    *byteorder == -1: little endian
1188    *byteorder == 0:  native order
1189    *byteorder == 1:  big endian
1190
1191   In native mode, the first four bytes of the stream are checked for a
1192   BOM mark. If found, the BOM mark is analysed, the byte order
1193   adjusted and the BOM skipped.  In the other modes, no BOM mark
1194   interpretation is done. After completion, *byteorder is set to the
1195   current byte order at the end of input data.
1196
1197   If byteorder is NULL, the codec starts in native order mode.
1198
1199*/
1200
1201PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
1202    const char *string,         /* UTF-32 encoded string */
1203    Py_ssize_t length,          /* size of string */
1204    const char *errors,         /* error handling */
1205    int *byteorder              /* pointer to byteorder to use
1206                                   0=native;-1=LE,1=BE; updated on
1207                                   exit */
1208    );
1209
1210PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
1211    const char *string,         /* UTF-32 encoded string */
1212    Py_ssize_t length,          /* size of string */
1213    const char *errors,         /* error handling */
1214    int *byteorder,             /* pointer to byteorder to use
1215                                   0=native;-1=LE,1=BE; updated on
1216                                   exit */
1217    Py_ssize_t *consumed        /* bytes consumed */
1218    );
1219
1220/* Returns a Python string using the UTF-32 encoding in native byte
1221   order. The string always starts with a BOM mark.  */
1222
1223PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
1224    PyObject *unicode           /* Unicode object */
1225    );
1226
1227/* Returns a Python string object holding the UTF-32 encoded value of
1228   the Unicode data.
1229
1230   The output is written according to the byte order selected by
1231   byteorder:
1232
1233   byteorder == -1: little endian
1234   byteorder == 0:  native byte order (writes a BOM mark)
1235   byteorder == 1:  big endian
1236
1237   If byteorder is 0, the output string will always start with the
1238   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1239   prepended.
1240
1241*/
1242
1243#ifndef Py_LIMITED_API
1244PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
1245    const Py_UNICODE *data,     /* Unicode char buffer */
1246    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1247    const char *errors,         /* error handling */
1248    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1249    );
1250PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1251    PyObject *object,           /* Unicode object */
1252    const char *errors,         /* error handling */
1253    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1254    );
1255#endif
1256
1257/* --- UTF-16 Codecs ------------------------------------------------------ */
1258
1259/* Decodes length bytes from a UTF-16 encoded buffer string and returns
1260   the corresponding Unicode object.
1261
1262   errors (if non-NULL) defines the error handling. It defaults
1263   to "strict".
1264
1265   If byteorder is non-NULL, the decoder starts decoding using the
1266   given byte order:
1267
1268    *byteorder == -1: little endian
1269    *byteorder == 0:  native order
1270    *byteorder == 1:  big endian
1271
1272   In native mode, the first two bytes of the stream are checked for a
1273   BOM mark. If found, the BOM mark is analysed, the byte order
1274   adjusted and the BOM skipped.  In the other modes, no BOM mark
1275   interpretation is done. After completion, *byteorder is set to the
1276   current byte order at the end of input data.
1277
1278   If byteorder is NULL, the codec starts in native order mode.
1279
1280*/
1281
1282PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
1283    const char *string,         /* UTF-16 encoded string */
1284    Py_ssize_t length,          /* size of string */
1285    const char *errors,         /* error handling */
1286    int *byteorder              /* pointer to byteorder to use
1287                                   0=native;-1=LE,1=BE; updated on
1288                                   exit */
1289    );
1290
1291PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
1292    const char *string,         /* UTF-16 encoded string */
1293    Py_ssize_t length,          /* size of string */
1294    const char *errors,         /* error handling */
1295    int *byteorder,             /* pointer to byteorder to use
1296                                   0=native;-1=LE,1=BE; updated on
1297                                   exit */
1298    Py_ssize_t *consumed        /* bytes consumed */
1299    );
1300
1301/* Returns a Python string using the UTF-16 encoding in native byte
1302   order. The string always starts with a BOM mark.  */
1303
1304PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
1305    PyObject *unicode           /* Unicode object */
1306    );
1307
1308/* Returns a Python string object holding the UTF-16 encoded value of
1309   the Unicode data.
1310
1311   The output is written according to the byte order selected by
1312   byteorder:
1313
1314   byteorder == -1: little endian
1315   byteorder == 0:  native byte order (writes a BOM mark)
1316   byteorder == 1:  big endian
1317
1318   If byteorder is 0, the output string will always start with the
1319   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1320   prepended.
1321
1322   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1323   UCS-2. This trick makes it possible to add full UTF-16 capabilities
1324   at a later point without compromising the APIs.
1325
1326*/
1327
1328#ifndef Py_LIMITED_API
1329PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
1330    const Py_UNICODE *data,     /* Unicode char buffer */
1331    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1332    const char *errors,         /* error handling */
1333    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1334    );
1335PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1336    PyObject* unicode,          /* Unicode object */
1337    const char *errors,         /* error handling */
1338    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1339    );
1340#endif
1341
1342/* --- Unicode-Escape Codecs ---------------------------------------------- */
1343
1344PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
1345    const char *string,         /* Unicode-Escape encoded string */
1346    Py_ssize_t length,          /* size of string */
1347    const char *errors          /* error handling */
1348    );
1349
1350PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
1351    PyObject *unicode           /* Unicode object */
1352    );
1353
1354#ifndef Py_LIMITED_API
1355PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
1356    const Py_UNICODE *data,     /* Unicode char buffer */
1357    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1358    );
1359#endif
1360
1361/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1362
1363PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
1364    const char *string,         /* Raw-Unicode-Escape encoded string */
1365    Py_ssize_t length,          /* size of string */
1366    const char *errors          /* error handling */
1367    );
1368
1369PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
1370    PyObject *unicode           /* Unicode object */
1371    );
1372
1373#ifndef Py_LIMITED_API
1374PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
1375    const Py_UNICODE *data,     /* Unicode char buffer */
1376    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1377    );
1378#endif
1379
1380/* --- Unicode Internal Codec ---------------------------------------------
1381
1382    Only for internal use in _codecsmodule.c */
1383
1384#ifndef Py_LIMITED_API
1385PyObject *_PyUnicode_DecodeUnicodeInternal(
1386    const char *string,
1387    Py_ssize_t length,
1388    const char *errors
1389    );
1390#endif
1391
1392/* --- Latin-1 Codecs -----------------------------------------------------
1393
1394   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1395
1396*/
1397
1398PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1399    const char *string,         /* Latin-1 encoded string */
1400    Py_ssize_t length,          /* size of string */
1401    const char *errors          /* error handling */
1402    );
1403
1404PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1405    PyObject *unicode           /* Unicode object */
1406    );
1407
1408#ifndef Py_LIMITED_API
1409PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1410    PyObject* unicode,
1411    const char* errors);
1412
1413PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1414    const Py_UNICODE *data,     /* Unicode char buffer */
1415    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1416    const char *errors          /* error handling */
1417    );
1418#endif
1419
1420/* --- ASCII Codecs -------------------------------------------------------
1421
1422   Only 7-bit ASCII data is accepted. All other codes generate errors.
1423
1424*/
1425
1426PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1427    const char *string,         /* ASCII encoded string */
1428    Py_ssize_t length,          /* size of string */
1429    const char *errors          /* error handling */
1430    );
1431
1432PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1433    PyObject *unicode           /* Unicode object */
1434    );
1435
1436#ifndef Py_LIMITED_API
1437PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1438    PyObject* unicode,
1439    const char* errors);
1440
1441PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1442    const Py_UNICODE *data,     /* Unicode char buffer */
1443    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1444    const char *errors          /* error handling */
1445    );
1446#endif
1447
1448/* --- Character Map Codecs -----------------------------------------------
1449
1450   This codec uses mappings to encode and decode characters.
1451
1452   Decoding mappings must map single string characters to single
1453   Unicode characters, integers (which are then interpreted as Unicode
1454   ordinals) or None (meaning "undefined mapping" and causing an
1455   error).
1456
1457   Encoding mappings must map single Unicode characters to single
1458   string characters, integers (which are then interpreted as Latin-1
1459   ordinals) or None (meaning "undefined mapping" and causing an
1460   error).
1461
1462   If a character lookup fails with a LookupError, the character is
1463   copied as-is, meaning that its ordinal value will be interpreted as
1464   a Unicode or Latin-1 ordinal, respectively. Because of this, mappings
1465   only need to contain those entries which map characters to different
1466   code points.
1467
1468*/
1469
1470PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1471    const char *string,         /* Encoded string */
1472    Py_ssize_t length,          /* size of string */
1473    PyObject *mapping,          /* character mapping
1474                                   (char ordinal -> unicode ordinal) */
1475    const char *errors          /* error handling */
1476    );
1477
1478PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1479    PyObject *unicode,          /* Unicode object */
1480    PyObject *mapping           /* character mapping
1481                                   (unicode ordinal -> char ordinal) */
1482    );
1483
1484#ifndef Py_LIMITED_API
1485PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1486    const Py_UNICODE *data,     /* Unicode char buffer */
1487    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1488    PyObject *mapping,          /* character mapping
1489                                   (unicode ordinal -> char ordinal) */
1490    const char *errors          /* error handling */
1491    );
1492PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1493    PyObject *unicode,          /* Unicode object */
1494    PyObject *mapping,          /* character mapping
1495                                   (unicode ordinal -> char ordinal) */
1496    const char *errors          /* error handling */
1497    );
1498#endif
1499
1500/* Translate a Py_UNICODE buffer of the given length by applying a
1501   character mapping table to it and return the resulting Unicode
1502   object.
1503
1504   The mapping table must map Unicode ordinal integers to Unicode
1505   ordinal integers or None (causing deletion of the character).
1506
1507   Mapping tables may be dictionaries or sequences. Unmapped character
1508   ordinals (ones which cause a LookupError) are left untouched and
1509   are copied as-is.
1510
1511*/
1512
1513#ifndef Py_LIMITED_API
1514PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1515    const Py_UNICODE *data,     /* Unicode char buffer */
1516    Py_ssize_t length,          /* Number of Py_UNICODE chars to translate */
1517    PyObject *table,            /* Translation table */
1518    const char *errors          /* error handling */
1519    );
1520#endif
1521
1522#ifdef HAVE_MBCS
1523
1524/* --- MBCS codecs for Windows -------------------------------------------- */
1525
1526PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1527    const char *string,         /* MBCS encoded string */
1528    Py_ssize_t length,              /* size of string */
1529    const char *errors          /* error handling */
1530    );
1531
1532PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1533    const char *string,         /* MBCS encoded string */
1534    Py_ssize_t length,          /* size of string */
1535    const char *errors,         /* error handling */
1536    Py_ssize_t *consumed        /* bytes consumed */
1537    );
1538
1539PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1540    int code_page,              /* code page number */
1541    const char *string,         /* encoded string */
1542    Py_ssize_t length,          /* size of string */
1543    const char *errors,         /* error handling */
1544    Py_ssize_t *consumed        /* bytes consumed */
1545    );
1546
1547PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1548    PyObject *unicode           /* Unicode object */
1549    );
1550
1551#ifndef Py_LIMITED_API
1552PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1553    const Py_UNICODE *data,     /* Unicode char buffer */
1554    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1555    const char *errors          /* error handling */
1556    );
1557#endif
1558
1559PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1560    int code_page,              /* code page number */
1561    PyObject *unicode,          /* Unicode object */
1562    const char *errors          /* error handling */
1563    );
1564
1565#endif /* HAVE_MBCS */
1566
1567/* --- Decimal Encoder ---------------------------------------------------- */
1568
1569/* Takes a Unicode string holding a decimal value and writes it into
1570   an output buffer using standard ASCII digit codes.
1571
1572   The output buffer has to provide at least length+1 bytes of storage
1573   area. The output string is 0-terminated.
1574
1575   The encoder converts whitespace to ' ', decimal characters to their
1576   corresponding ASCII digit and all other Latin-1 characters except
1577   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1578   are treated as errors. This includes embedded NUL characters.
1579
1580   Error handling is defined by the errors argument:
1581
1582      NULL or "strict": raise a ValueError
1583      "ignore": ignore the wrong characters (these are not copied to the
1584                output buffer)
1585      "replace": replaces illegal characters with '?'
1586
1587   Returns 0 on success, -1 on failure.
1588
1589*/
1590
1591#ifndef Py_LIMITED_API
1592PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1593    Py_UNICODE *s,              /* Unicode buffer */
1594    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1595    char *output,               /* Output buffer; must have size >= length+1 */
1596    const char *errors          /* error handling */
1597    );
1598#endif
1599
1600/* Transforms code points that have decimal digit property to the
1601   corresponding ASCII digit code points.
1602
1603   Returns a new Unicode string on success, NULL on failure.
1604*/
1605
1606#ifndef Py_LIMITED_API
1607PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1608    Py_UNICODE *s,              /* Unicode buffer */
1609    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
1610    );
1611#endif
1612
1613/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
1614   as argument instead of a raw buffer and length.  This function additionally
1615   transforms spaces to ASCII because this is what the callers in longobject,
1616   floatobject, and complexobject did anyways. */
1617
1618#ifndef Py_LIMITED_API
1619PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1620    PyObject *unicode           /* Unicode object */
1621    );
1622#endif
1623
1624/* --- Locale encoding --------------------------------------------------- */
1625
1626/* Decode a string from the current locale encoding. The decoder is strict
1627   unless *errors* selects the 'surrogateescape' error handler (PEP 383),
1628   which escapes undecodable bytes. If a byte sequence can be decoded as a
1629   surrogate character and the 'surrogateescape' error handler is selected,
1630   the byte sequence is escaped using that error handler instead of being
1631   decoded. *str* must end with a null character but cannot contain
1632   embedded null characters. */
1633
1634PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1635    const char *str,
1636    Py_ssize_t len,
1637    const char *errors);
1638
1639/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1640   length using strlen(). */
1641
1642PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1643    const char *str,
1644    const char *errors);
1645
1646/* Encode a Unicode object to the current locale encoding. The encoder is
1647   strict if *errors* is NULL or "strict"; if *errors* is "surrogateescape",
1648   the "surrogateescape" error handler is used. Return a bytes object. The
1649   string cannot contain embedded null characters. */
1650
1651PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1652    PyObject *unicode,
1653    const char *errors
1654    );
1655
1656/* --- File system encoding ---------------------------------------------- */
1657
1658/* ParseTuple converter: encode str objects to bytes using
1659   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1660
1661PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1662
1663/* ParseTuple converter: decode bytes objects to unicode using
1664   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1665
1666PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1667
1668/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1669   and the "surrogateescape" error handler.
1670
1671   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1672   encoding.
1673
1674   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1675*/
1676
1677PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1678    const char *s               /* encoded string */
1679    );
1680
1681/* Decode a string using Py_FileSystemDefaultEncoding
1682   and the "surrogateescape" error handler.
1683
1684   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1685   encoding.
1686*/
1687
1688PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1689    const char *s,               /* encoded string */
1690    Py_ssize_t size              /* size */
1691    );
1692
1693/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1694   "surrogateescape" error handler, and return bytes.
1695
1696   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1697   encoding.
1698*/
1699
1700PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1701    PyObject *unicode
1702    );
1703
1704/* --- Methods & Slots ----------------------------------------------------
1705
1706   These are capable of handling Unicode objects and strings on input
1707   (we refer to them as strings in the descriptions) and return
1708   Unicode objects or integers as appropriate. */
1709
1710/* Concat two strings giving a new Unicode string. */
1711
1712PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1713    PyObject *left,             /* Left string */
1714    PyObject *right             /* Right string */
1715    );
1716
1717/* Concat two strings and put the result in *pleft
1718   (sets *pleft to NULL on error) */
1719
1720PyAPI_FUNC(void) PyUnicode_Append(
1721    PyObject **pleft,           /* Pointer to left string */
1722    PyObject *right             /* Right string */
1723    );
1724
1725/* Concat two strings, put the result in *pleft and drop the right object
1726   (sets *pleft to NULL on error) */
1727
1728PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1729    PyObject **pleft,           /* Pointer to left string */
1730    PyObject *right             /* Right string */
1731    );
1732
1733/* Split a string giving a list of Unicode strings.
1734
1735   If sep is NULL, splitting will be done at all whitespace
1736   substrings. Otherwise, splits occur at the given separator.
1737
1738   At most maxsplit splits will be done. If negative, no limit is set.
1739
1740   Separators are not included in the resulting list.
1741
1742*/
1743
1744PyAPI_FUNC(PyObject*) PyUnicode_Split(
1745    PyObject *s,                /* String to split */
1746    PyObject *sep,              /* String separator */
1747    Py_ssize_t maxsplit         /* Maxsplit count */
1748    );
1749
1750/* Ditto, but split at line breaks.
1751
1752   CRLF is considered to be one line break. Line breaks are not
1753   included in the resulting list unless keepends is true. */
1754
1755PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1756    PyObject *s,                /* String to split */
1757    int keepends                /* If true, line end markers are included */
1758    );
1759
1760/* Partition a string using a given separator. */
1761
1762PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1763    PyObject *s,                /* String to partition */
1764    PyObject *sep               /* String separator */
1765    );
1766
1767/* Partition a string using a given separator, searching from the end of the
1768   string. */
1769
1770PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1771    PyObject *s,                /* String to partition */
1772    PyObject *sep               /* String separator */
1773    );
1774
1775/* Split a string giving a list of Unicode strings.
1776
1777   If sep is NULL, splitting will be done at all whitespace
1778   substrings. Otherwise, splits occur at the given separator.
1779
1780   At most maxsplit splits will be done; if maxsplit is negative, no
1781   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1782   the end of the string.
1783
1784   Separators are not included in the resulting list.
1785
1786*/
1787
1788PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1789    PyObject *s,                /* String to split */
1790    PyObject *sep,              /* String separator */
1791    Py_ssize_t maxsplit         /* Maxsplit count */
1792    );
1793
1794/* Translate a string by applying a character mapping table to it and
1795   return the resulting Unicode object.
1796
1797   The mapping table must map Unicode ordinal integers to Unicode
1798   ordinal integers or None (causing deletion of the character).
1799
1800   Mapping tables may be dictionaries or sequences. Unmapped character
1801   ordinals (ones which cause a LookupError) are left untouched and
1802   are copied as-is.
1803
1804*/
1805
1806PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1807    PyObject *str,              /* String */
1808    PyObject *table,            /* Translate table */
1809    const char *errors          /* error handling */
1810    );
1811
1812/* Join a sequence of strings using the given separator and return
1813   the resulting Unicode string. */
1814
1815PyAPI_FUNC(PyObject*) PyUnicode_Join(
1816    PyObject *separator,        /* Separator string */
1817    PyObject *seq               /* Sequence object */
1818    );
1819
1820/* Return 1 if substr matches str[start:end] at the given tail end, 0
1821   otherwise. */
1822
1823PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1824    PyObject *str,              /* String */
1825    PyObject *substr,           /* Prefix or Suffix string */
1826    Py_ssize_t start,           /* Start index */
1827    Py_ssize_t end,             /* Stop index */
1828    int direction               /* Tail end: -1 prefix, +1 suffix */
1829    );
1830
1831/* Return the first position of substr in str[start:end] using the
1832   given search direction or -1 if not found. -2 is returned in case
1833   an error occurred and an exception is set. */
1834
1835PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1836    PyObject *str,              /* String */
1837    PyObject *substr,           /* Substring to find */
1838    Py_ssize_t start,           /* Start index */
1839    Py_ssize_t end,             /* Stop index */
1840    int direction               /* Find direction: +1 forward, -1 backward */
1841    );
1842
1843/* Like PyUnicode_Find, but search for a single character only. */
1844PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1845    PyObject *str,              /* String */
1846    Py_UCS4 ch,                 /* Character to find */
1847    Py_ssize_t start,           /* Start index */
1848    Py_ssize_t end,             /* Stop index */
1849    int direction               /* Find direction: +1 forward, -1 backward */
1850    );
1851
1852/* Count the number of occurrences of substr in str[start:end]. */
1853
1854PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1855    PyObject *str,              /* String */
1856    PyObject *substr,           /* Substring to count */
1857    Py_ssize_t start,           /* Start index */
1858    Py_ssize_t end              /* Stop index */
1859    );
1860
1861/* Replace at most maxcount occurrences of substr in str with replstr
1862   and return the resulting Unicode object. */
1863
1864PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1865    PyObject *str,              /* String */
1866    PyObject *substr,           /* Substring to find */
1867    PyObject *replstr,          /* Substring to replace */
1868    Py_ssize_t maxcount         /* Max. number of replacements to apply;
1869                                   -1 = all */
1870    );
1871
1872/* Compare two strings and return -1, 0, 1 for less than, equal,
1873   greater than resp. */
1874
1875PyAPI_FUNC(int) PyUnicode_Compare(
1876    PyObject *left,             /* Left string */
1877    PyObject *right             /* Right string */
1878    );
1879
1880PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1881    PyObject *left,
1882    const char *right           /* ASCII-encoded string */
1883    );
1884
1885/* Rich compare two strings and return one of the following:
1886
1887   - NULL in case an exception was raised
1888   - Py_True or Py_False for successful comparisons
1889   - Py_NotImplemented in case the type combination is unknown
1890
1891   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1892   case the conversion of the arguments to Unicode fails with a
1893   UnicodeDecodeError.
1894
1895   Possible values for op:
1896
1897     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1898
1899*/
1900
1901PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1902    PyObject *left,             /* Left string */
1903    PyObject *right,            /* Right string */
1904    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1905    );
1906
1907/* Apply an argument tuple or dictionary to a format string and return
1908   the resulting Unicode string. */
1909
1910PyAPI_FUNC(PyObject *) PyUnicode_Format(
1911    PyObject *format,           /* Format string */
1912    PyObject *args              /* Argument tuple or dictionary */
1913    );
1914
1915/* Checks whether element is contained in container and returns 1/0
1916   accordingly.
1917
1918   element has to coerce to a one-element Unicode string. -1 is
1919   returned in case of an error. */
1920
1921PyAPI_FUNC(int) PyUnicode_Contains(
1922    PyObject *container,        /* Container string */
1923    PyObject *element           /* Element string */
1924    );
1925
1926/* Checks whether the string contains any NUL characters. */
1927
1928#ifndef Py_LIMITED_API
1929PyAPI_FUNC(int) _PyUnicode_HasNULChars(PyObject *);
1930#endif
1931
1932/* Checks whether argument is a valid identifier. */
1933
1934PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1935
1936#ifndef Py_LIMITED_API
1937/* Externally visible for str.strip(unicode) */
1938PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1939    PyObject *self,
1940    int striptype,
1941    PyObject *sepobj
1942    );
1943#endif
1944
1945/* Using explicit passed-in values, insert the thousands grouping
1946   into the string pointed to by buffer.  For the argument descriptions,
1947   see Objects/stringlib/localeutil.h */
1948#ifndef Py_LIMITED_API
1949PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1950    PyObject *unicode,
1951    Py_ssize_t index,
1952    Py_ssize_t n_buffer,
1953    void *digits,
1954    Py_ssize_t n_digits,
1955    Py_ssize_t min_width,
1956    const char *grouping,
1957    PyObject *thousands_sep,
1958    Py_UCS4 *maxchar);
1959#endif
1960/* === Characters Type APIs =============================================== */
1961
1962/* Helper array used by Py_UNICODE_ISSPACE(). */
1963
1964#ifndef Py_LIMITED_API
1965PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1966
1967/* These should not be used directly. Use the Py_UNICODE_IS* and
1968   Py_UNICODE_TO* macros instead.
1969
1970   These APIs are implemented in Objects/unicodectype.c.
1971
1972*/
1973
1974PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1975    Py_UCS4 ch       /* Unicode character */
1976    );
1977
1978PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1979    Py_UCS4 ch       /* Unicode character */
1980    );
1981
1982PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1983    Py_UCS4 ch       /* Unicode character */
1984    );
1985
1986PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1987    Py_UCS4 ch       /* Unicode character */
1988    );
1989
1990PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1991    Py_UCS4 ch       /* Unicode character */
1992    );
1993
1994PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1995    const Py_UCS4 ch         /* Unicode character */
1996    );
1997
1998PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1999    const Py_UCS4 ch         /* Unicode character */
2000    );
2001
2002PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2003    Py_UCS4 ch       /* Unicode character */
2004    );
2005
2006PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2007    Py_UCS4 ch       /* Unicode character */
2008    );
2009
2010PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2011    Py_UCS4 ch       /* Unicode character */
2012    );
2013
2014PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2015    Py_UCS4 ch,       /* Unicode character */
2016    Py_UCS4 *res
2017    );
2018
2019PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2020    Py_UCS4 ch,       /* Unicode character */
2021    Py_UCS4 *res
2022    );
2023
2024PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2025    Py_UCS4 ch,       /* Unicode character */
2026    Py_UCS4 *res
2027    );
2028
2029PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2030    Py_UCS4 ch,       /* Unicode character */
2031    Py_UCS4 *res
2032    );
2033
2034PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
2035    Py_UCS4 ch         /* Unicode character */
2036    );
2037
2038PyAPI_FUNC(int) _PyUnicode_IsCased(
2039    Py_UCS4 ch         /* Unicode character */
2040    );
2041
2042PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
2043    Py_UCS4 ch       /* Unicode character */
2044    );
2045
2046PyAPI_FUNC(int) _PyUnicode_ToDigit(
2047    Py_UCS4 ch       /* Unicode character */
2048    );
2049
2050PyAPI_FUNC(double) _PyUnicode_ToNumeric(
2051    Py_UCS4 ch       /* Unicode character */
2052    );
2053
2054PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
2055    Py_UCS4 ch       /* Unicode character */
2056    );
2057
2058PyAPI_FUNC(int) _PyUnicode_IsDigit(
2059    Py_UCS4 ch       /* Unicode character */
2060    );
2061
2062PyAPI_FUNC(int) _PyUnicode_IsNumeric(
2063    Py_UCS4 ch       /* Unicode character */
2064    );
2065
2066PyAPI_FUNC(int) _PyUnicode_IsPrintable(
2067    Py_UCS4 ch       /* Unicode character */
2068    );
2069
2070PyAPI_FUNC(int) _PyUnicode_IsAlpha(
2071    Py_UCS4 ch       /* Unicode character */
2072    );
2073
2074PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2075    const Py_UNICODE *u
2076    );
2077
2078PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
2079    Py_UNICODE *s1,
2080    const Py_UNICODE *s2);
2081
2082PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2083    Py_UNICODE *s1, const Py_UNICODE *s2);
2084
2085PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
2086    Py_UNICODE *s1,
2087    const Py_UNICODE *s2,
2088    size_t n);
2089
2090PyAPI_FUNC(int) Py_UNICODE_strcmp(
2091    const Py_UNICODE *s1,
2092    const Py_UNICODE *s2
2093    );
2094
2095PyAPI_FUNC(int) Py_UNICODE_strncmp(
2096    const Py_UNICODE *s1,
2097    const Py_UNICODE *s2,
2098    size_t n
2099    );
2100
2101PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
2102    const Py_UNICODE *s,
2103    Py_UNICODE c
2104    );
2105
2106PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
2107    const Py_UNICODE *s,
2108    Py_UNICODE c
2109    );
2110
2111/* Create a copy of a unicode string ending with a nul character. Return NULL
2112   and raise a MemoryError exception on memory allocation failure, otherwise
2113   return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2114
2115PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
2116    PyObject *unicode
2117    );
2118#endif /* Py_LIMITED_API */
2119
2120#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
2121PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
2122    PyObject *op,
2123    int check_content);
2124#endif
2125
2126/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2127PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2128/* Clear all static strings. */
2129PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2130
2131#ifdef __cplusplus
2132}
2133#endif
2134#endif /* !Py_UNICODEOBJECT_H */
2135