unicodeobject.h revision 080a2c087e5fa08c44ff121d74ea8ad9d4413c58
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
12
13Copyright (c) Corporation for National Research Initiatives.
14
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python.  This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
32 *
33 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
35 *
36 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
39 *
40 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
48 *
49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
58#include <ctype.h>
59
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
64/* Python 3.x requires unicode */
65#define Py_USING_UNICODE
66
67#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
69#endif
70
71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74   Otherwise, Unicode strings are stored as UCS-2 (with limited support
75   for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and is now simply a typedef
   for wchar_t.  Neither name is exposed when Py_LIMITED_API is defined. */

#ifndef Py_LIMITED_API
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
95
96/* If the compiler provides a wchar_t type we try to support it
97   through the interface functions PyUnicode_FromWideChar(),
98   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
99
100#ifdef HAVE_USABLE_WCHAR_T
101# ifndef HAVE_WCHAR_H
102#  define HAVE_WCHAR_H
103# endif
104#endif
105
106#if defined(MS_WINDOWS)
107#  define HAVE_MBCS
108#endif
109
110#ifdef HAVE_WCHAR_H
111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113#  include <time.h>
114# endif
115#  include <wchar.h>
116#endif
117
118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
119   unicode representations. */
120#if SIZEOF_INT == 4
121typedef unsigned int Py_UCS4;
122#elif SIZEOF_LONG == 4
123typedef unsigned long Py_UCS4;
124#else
125#error "Could not find a proper typedef for Py_UCS4"
126#endif
127
128#if SIZEOF_SHORT == 2
129typedef unsigned short Py_UCS2;
130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
134typedef unsigned char Py_UCS1;
135
136/* --- Internal Unicode Operations ---------------------------------------- */
137
138/* Since splitting on whitespace is an important use case, and
139   whitespace in most situations is solely ASCII whitespace, we
140   optimize for the common case by using a quick look-up table
141   _Py_ascii_whitespace (see below) with an inlined check.
142
143 */
144#ifndef Py_LIMITED_API
/* Whitespace test with an inlined fast path: values below 128 are
   answered from the _Py_ascii_whitespace table, everything else is
   handed to _PyUnicode_IsWhitespace().  `ch` is evaluated twice, so it
   must not have side effects. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* Case and line-break predicates: thin aliases for the _PyUnicode_Is*()
   helpers. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions: thin aliases for the _PyUnicode_To*case() helpers. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversions. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if `ch` is alphabetic, decimal, digit or numeric.  The four tests
   short-circuit left to right, so `ch` is evaluated between one and four
   times and must not have side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
173
/* Copy `length` Py_UNICODE code units from `source` to `target`.
   Expands to a single Py_MEMCPY() of length*sizeof(Py_UNICODE) bytes. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store `value` into the first `length` Py_UNICODE slots of `target`.
   Every argument is evaluated exactly once, in the order target, value,
   length; the length is captured before the loop so an expression with
   side effects is not re-run on each iteration. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
181
/* Macros to work with surrogates.

   The three predicates evaluate `ch` twice; the argument is fully
   parenthesized so that expressions such as `c & 0xFFFF` or `a ? b : c`
   are tested as a whole rather than being split by operator precedence. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
194
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first code unit is compared, then the last one, and only if both
   agree is the whole range compared with memcmp().  Both object
   arguments are evaluated several times.

   NOTE(review): the macro reads `->wstr` and `->wstr_length` directly
   from its arguments; none of the structures declared in this header
   exposes both members at the top level -- confirm which pointer type
   callers are expected to pass. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
201     !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
202
203#endif /* Py_LIMITED_API */
204
205#ifdef __cplusplus
206extern "C" {
207#endif
208
209/* --- Unicode Type ------------------------------------------------------- */
210
211#ifndef Py_LIMITED_API
212
213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
214   structure. state.ascii and state.compact are set, and the data
215   immediately follow the structure. utf8_length and wstr_length can be found
216   in the length field; the utf8 pointer is equal to the data pointer. */
217typedef struct {
218    /* There are 4 forms of Unicode strings:
219
220       - compact ascii:
221
222         * structure = PyASCIIObject
223         * test: PyUnicode_IS_COMPACT_ASCII(op)
224         * kind = PyUnicode_1BYTE_KIND
225         * compact = 1
226         * ascii = 1
227         * ready = 1
228         * (length is the length of the utf8 and wstr strings)
229         * (data starts just after the structure)
230         * (since ASCII is decoded from UTF-8, the utf8 string are the data)
231
232       - compact:
233
234         * structure = PyCompactUnicodeObject
235         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
236         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
237           PyUnicode_4BYTE_KIND
238         * compact = 1
239         * ready = 1
240         * ascii = 0
241         * utf8 is not shared with data
242         * utf8_length = 0 if utf8 is NULL
243         * wstr is shared with data and wstr_length=length
244           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
245           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
246         * wstr_length = 0 if wstr is NULL
247         * (data starts just after the structure)
248
249       - legacy string, not ready:
250
251         * structure = PyUnicodeObject
252         * test: kind == PyUnicode_WCHAR_KIND
253         * length = 0 (use wstr_length)
254         * hash = -1
255         * kind = PyUnicode_WCHAR_KIND
256         * compact = 0
257         * ascii = 0
258         * ready = 0
259         * interned = SSTATE_NOT_INTERNED
260         * wstr is not NULL
261         * data.any is NULL
262         * utf8 is NULL
263         * utf8_length = 0
264
265       - legacy string, ready:
266
267         * structure = PyUnicodeObject structure
268         * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
269         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
270           PyUnicode_4BYTE_KIND
271         * compact = 0
272         * ready = 1
273         * data.any is not NULL
274         * utf8 is shared and utf8_length = length with data.any if ascii = 1
275         * utf8_length = 0 if utf8 is NULL
276         * wstr is shared with data.any and wstr_length = length
277           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
278           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
279         * wstr_length = 0 if wstr is NULL
280
281       Compact strings use only one memory block (structure + characters),
282       whereas legacy strings use one block for the structure and one block
283       for characters.
284
285       Legacy strings are created by PyUnicode_FromUnicode() and
286       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
287       when PyUnicode_READY() is called.
288
289       See also _PyUnicode_CheckConsistency().
290    */
291    PyObject_HEAD
292    Py_ssize_t length;          /* Number of code points in the string */
293    Py_hash_t hash;             /* Hash value; -1 if not set */
294    struct {
295        /*
296           SSTATE_NOT_INTERNED (0)
297           SSTATE_INTERNED_MORTAL (1)
298           SSTATE_INTERNED_IMMORTAL (2)
299
300           If interned != SSTATE_NOT_INTERNED, the two references from the
301           dictionary to this object are *not* counted in ob_refcnt.
302         */
303        unsigned int interned:2;
304        /* Character size:
305
306           - PyUnicode_WCHAR_KIND (0):
307
308             * character type = wchar_t (16 or 32 bits, depending on the
309               platform)
310
311           - PyUnicode_1BYTE_KIND (1):
312
313             * character type = Py_UCS1 (8 bits, unsigned)
314             * all characters are in the range U+0000-U+00FF (latin1)
315             * if ascii is set, all characters are in the range U+0000-U+007F
316               (ASCII), otherwise at least one character is in the range
317               U+0080-U+00FF
318
319           - PyUnicode_2BYTE_KIND (2):
320
321             * character type = Py_UCS2 (16 bits, unsigned)
322             * all characters are in the range U+0000-U+FFFF (BMP)
323             * at least one character is in the range U+0100-U+FFFF
324
325           - PyUnicode_4BYTE_KIND (4):
326
327             * character type = Py_UCS4 (32 bits, unsigned)
328             * all characters are in the range U+0000-U+10FFFF
329             * at least one character is in the range U+10000-U+10FFFF
330         */
331        unsigned int kind:3;
332        /* Compact is with respect to the allocation scheme. Compact unicode
333           objects only require one memory block while non-compact objects use
334           one block for the PyUnicodeObject struct and another for its data
335           buffer. */
336        unsigned int compact:1;
337        /* The string only contains characters in the range U+0000-U+007F (ASCII)
338           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
339           set, use the PyASCIIObject structure. */
340        unsigned int ascii:1;
341        /* The ready flag indicates whether the object layout is initialized
342           completely. This means that this is either a compact object, or
343           the data pointer is filled out. The bit is redundant, and helps
344           to minimize the test in PyUnicode_IS_READY(). */
345        unsigned int ready:1;
346    } state;
347    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
348} PyASCIIObject;
349
350/* Non-ASCII strings allocated through PyUnicode_New use the
351   PyCompactUnicodeObject structure. state.compact is set, and the data
352   immediately follow the structure. */
353typedef struct {
354    PyASCIIObject _base;
355    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
356                                 * terminating \0. */
357    char *utf8;                 /* UTF-8 representation (null-terminated) */
358    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
359                                 * surrogates count as two code points. */
360} PyCompactUnicodeObject;
361
362/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
363   PyUnicodeObject structure. The actual string data is initially in the wstr
364   block, and copied into the data block using _PyUnicode_Ready. */
365typedef struct {
366    PyCompactUnicodeObject _base;
367    union {
368        void *any;
369        Py_UCS1 *latin1;
370        Py_UCS2 *ucs2;
371        Py_UCS4 *ucs4;
372    } data;                     /* Canonical, smallest-form Unicode buffer */
373} PyUnicodeObject;
374#endif
375
376PyAPI_DATA(PyTypeObject) PyUnicode_Type;
377PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
378
379#define PyUnicode_Check(op) \
380                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
381#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
382
383/* Fast access macros */
384#ifndef Py_LIMITED_API
385
/* Length of the wstr representation.  Compact ASCII objects are plain
   PyASCIIObject structures without a wstr_length member, so their
   `length` field is used instead.  `op` is parenthesized under each cast
   so that arbitrary pointer expressions can be passed; it is evaluated
   more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
390
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request.  Use PyUnicode_GET_LENGTH() for the length in code points.

   When wstr is not set yet, PyUnicode_AsUnicode() is called first and its
   result is discarded; the length is then read from the object.  `op` is
   evaluated more than once. */

#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
       assert(((PyASCIIObject *)(op))->wstr),        \
       PyUnicode_WSTR_LENGTH(op)))

/* Size of the Py_UNICODE representation in bytes. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)

/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ().

   An already present wstr pointer is returned directly without calling
   PyUnicode_AsUnicode(). */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* Same buffer as PyUnicode_AS_UNICODE(), viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
419
420
421/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
422
/* Values for PyASCIIObject.state: */

/* Interning state, stored in the 2-bit state.interned field. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
429
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.  `op` is parenthesized under the cast so that arbitrary pointer
   expressions can be passed. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
437
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   `op` is parenthesized under the cast so that pointer expressions such as
   `base + i` address the intended object; it is evaluated twice. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
447
/* Character storage kinds.  Each ready kind has the numeric value of its
   character size in bytes (1, 2 or 4). */
enum PyUnicode_Kind {
/* String contains only wstr byte characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet.  */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
458
/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly. */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))

/* Return one of the PyUnicode_*_KIND values defined above.
   The object must be ready (asserted). */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)

/* Data of a compact string: the characters start right after the
   structure, which is PyASCIIObject for ASCII strings and
   PyCompactUnicodeObject otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Data of a non-compact string: the separately stored data.any pointer,
   which must already be set (asserted). */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Return a void pointer to the raw unicode buffer. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
488
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.

   The value is cast to the character type selected by kind, so a value
   that does not fit is truncated rather than rejected.  Any kind other
   than 1BYTE or 2BYTE is handled as 4BYTE (asserted). */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)
515
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.

   The result is always a Py_UCS4.  `kind` may be evaluated twice; `data`
   and `index` are evaluated once.  Any kind other than 1BYTE or 2BYTE is
   read as 4BYTE. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
527
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.

   The object must be ready (asserted).  `unicode` is evaluated several
   times, so it must not have side effects; `index` is evaluated once. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
543
/* Returns the length of the unicode string, in code points.  The caller
   has to make sure that the string has its canonical representation set
   before calling this macro (this is asserted).  Call PyUnicode_READY()
   to ensure that. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
551
552
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   `op` is parenthesized under the cast so that pointer expressions such
   as `base + i` address the intended object. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
557
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case: an object whose ready flag is already set yields 0 without a
   function call.  If the canonical representation is not yet set, it will
   still call _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
566
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.

   The value is derived from the ascii flag and the kind only: 0x7f for
   ASCII strings, otherwise 0xff, 0xffff or 0x10ffff for the 1-, 2- and
   4-byte kinds.  The object must be ready (asserted); `op` is evaluated
   several times. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_ASCII(op) ?                                          \
      (0x7f) :                                                          \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (0xffU) :                                                        \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) :                                                     \
        (0x10ffffU)))))
579
580#endif
581
582/* --- Constants ---------------------------------------------------------- */
583
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0.  The constant has type Py_UCS4. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
590
591/* === Public API ========================================================= */
592
593/* --- Plain Py_UNICODE --------------------------------------------------- */
594
595/* With PEP 393, this is the recommended way to allocate a new unicode object.
596   This function will allocate the object and its buffer in a single memory
597   block.  Objects created using this function are not resizable. */
598#ifndef Py_LIMITED_API
599PyAPI_FUNC(PyObject*) PyUnicode_New(
600    Py_ssize_t size,            /* Number of code points in the new string */
601    Py_UCS4 maxchar             /* maximum code point value in the string */
602    );
603#endif
604
/* Initializes the canonical string representation from the deprecated
   wstr/Py_UNICODE representation. This function is used to convert Unicode
   objects which were created using the old API to the new flexible format
   introduced with PEP 393.

   Don't call this function directly, use the public PyUnicode_READY() macro
   instead. */
612#ifndef Py_LIMITED_API
613PyAPI_FUNC(int) _PyUnicode_Ready(
614    PyObject *unicode           /* Unicode object */
615    );
616#endif
617
618/* Get a copy of a Unicode string. */
619#ifndef Py_LIMITED_API
620PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
621    PyObject *unicode
622    );
623#endif
624
/* Copy characters from one unicode object into another.  This function
   performs character conversion when necessary and falls back to memcpy()
   if possible.

   Fail if to is too small (smaller than *how_many* or smaller than
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   kind(to), or if *to* has more than 1 reference.

   Return the number of characters written, or return -1 and raise an
   exception on error.

   Pseudo-code:

       how_many = min(how_many, len(from) - from_start)
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
       return how_many

   Note: The function doesn't write a terminating null character.
   */
643#ifndef Py_LIMITED_API
644PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
645    PyObject *to,
646    Py_ssize_t to_start,
647    PyObject *from,
648    Py_ssize_t from_start,
649    Py_ssize_t how_many
650    );
651
652/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
653   may crash if parameters are invalid (e.g. if the output string
654   is too short). */
655PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
656    PyObject *to,
657    Py_ssize_t to_start,
658    PyObject *from,
659    Py_ssize_t from_start,
660    Py_ssize_t how_many
661    );
662#endif
663
664#ifndef Py_LIMITED_API
/* Fill a string with a character: write fill_char into
   unicode[start:start+length].

   Fail if fill_char is bigger than the string maximum character, or if the
   string has more than 1 reference.

   Return the number of characters written, or return -1 and raise an
   exception on error. */
673PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
674    PyObject *unicode,
675    Py_ssize_t start,
676    Py_ssize_t length,
677    Py_UCS4 fill_char
678    );
679
680/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
681   if parameters are invalid (e.g. if length is longer than the string). */
682PyAPI_FUNC(void) _PyUnicode_FastFill(
683    PyObject *unicode,
684    Py_ssize_t start,
685    Py_ssize_t length,
686    Py_UCS4 fill_char
687    );
688#endif
689
690/* Create a Unicode Object from the Py_UNICODE buffer u of the given
691   size.
692
693   u may be NULL which causes the contents to be undefined. It is the
694   user's responsibility to fill in the needed data afterwards. Note
695   that modifying the Unicode object contents after construction is
696   only allowed if u was set to NULL.
697
698   The buffer is copied into the new object. */
699
700#ifndef Py_LIMITED_API
701PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
702    const Py_UNICODE *u,        /* Unicode buffer */
703    Py_ssize_t size             /* size of buffer */
704    );
705#endif
706
707/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
708PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
709    const char *u,             /* UTF-8 encoded string */
710    Py_ssize_t size            /* size of buffer */
711    );
712
713/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
714   UTF-8 encoded bytes.  The size is determined with strlen(). */
715PyAPI_FUNC(PyObject*) PyUnicode_FromString(
716    const char *u              /* UTF-8 encoded string */
717    );
718
719#ifndef Py_LIMITED_API
720/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
721   Scan the string to find the maximum character. */
722PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
723    int kind,
724    const void *buffer,
725    Py_ssize_t size);
726
727/* Create a new string from a buffer of ASCII characters.
728   WARNING: Don't check if the string contains any non-ASCII character. */
729PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
730    const char *buffer,
731    Py_ssize_t size);
732#endif
733
734PyAPI_FUNC(PyObject*) PyUnicode_Substring(
735    PyObject *str,
736    Py_ssize_t start,
737    Py_ssize_t end);
738
739#ifndef Py_LIMITED_API
740/* Compute the maximum character of the substring unicode[start:end].
741   Return 127 for an empty string. */
742PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
743    PyObject *unicode,
744    Py_ssize_t start,
745    Py_ssize_t end);
746#endif
747
748/* Copy the string into a UCS4 buffer including the null character if copy_null
749   is set. Return NULL and raise an exception on error. Raise a ValueError if
750   the buffer is smaller than the string. Return buffer on success.
751
752   buflen is the length of the buffer in (Py_UCS4) characters. */
753PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
754    PyObject *unicode,
755    Py_UCS4* buffer,
756    Py_ssize_t buflen,
757    int copy_null);
758
/* Copy the string into a UCS4 buffer. A new buffer is allocated using
   PyMem_Malloc; if this fails, NULL is returned with a memory error
   exception set. */
762PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
763
764/* Return a read-only pointer to the Unicode object's internal
765   Py_UNICODE buffer.
766   If the wchar_t/Py_UNICODE representation is not yet available, this
767   function will calculate it. */
768
769#ifndef Py_LIMITED_API
770PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
771    PyObject *unicode           /* Unicode object */
772    );
773#endif
774
775/* Return a read-only pointer to the Unicode object's internal
776   Py_UNICODE buffer and save the length at size.
777   If the wchar_t/Py_UNICODE representation is not yet available, this
778   function will calculate it. */
779
780#ifndef Py_LIMITED_API
781PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
782    PyObject *unicode,          /* Unicode object */
783    Py_ssize_t *size            /* location where to save the length */
784    );
785#endif
786
787/* Get the length of the Unicode object. */
788
789PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
790    PyObject *unicode
791);
792
793/* Get the number of Py_UNICODE units in the
794   string representation.
   NOTE(review): the error return value is not documented here; presumably
   -1 with an exception set -- confirm. */
795
796PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
797    PyObject *unicode           /* Unicode object */
798    );
799
800/* Read a character from the string. */
801
802PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
803    PyObject *unicode,          /* Unicode object */
804    Py_ssize_t index            /* position of the character to read */
805    );
806
807/* Write a character to the string. The string must have been created through
808   PyUnicode_New, must not be shared, and must not have been hashed yet.
809
810   Return 0 on success, -1 on error. */
811
812PyAPI_FUNC(int) PyUnicode_WriteChar(
813    PyObject *unicode,          /* Unicode object to modify */
814    Py_ssize_t index,           /* position of the character to write */
815    Py_UCS4 character           /* character to store */
816    );
817
818#ifndef Py_LIMITED_API
819/* Get the maximum ordinal for a Unicode character.
   The value is returned as a Py_UNICODE, so it cannot exceed what that type
   can represent. Hidden when Py_LIMITED_API is defined. */
820PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
821#endif
822
823/* Resize a Unicode object. The length is the number of characters, except
824   if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
825   is the number of Py_UNICODE characters.
826
827   On success, *unicode is modified to point to the new (resized) object and
828   0 is returned.
829
830   The function tries to resize the string in place (which is usually faster
831   than allocating a new string and copying characters), or creates a new
   string.
832
833   Error handling is implemented as follows: an exception is set, -1
834   is returned and *unicode is left untouched.
835
836   WARNING: The function doesn't check string content, the result may not be a
837            string in canonical representation. */
838
839PyAPI_FUNC(int) PyUnicode_Resize(
840    PyObject **unicode,         /* Pointer to the Unicode object */
841    Py_ssize_t length           /* New length */
842    );
843
844/* Coerce obj to a Unicode object and return a reference with
845   *incremented* refcount.
846
847   Coercion is done in the following way:
848
849   1. bytes, bytearray and other char buffer compatible objects are decoded
850      under the assumptions that they contain data using the UTF-8
851      encoding. Decoding is done in "strict" mode.
852
853   2. All other objects (including Unicode objects) raise an
854      exception.
855
856   The API returns NULL in case of an error. The caller is responsible
857   for decref'ing the returned objects.
858
859*/
860
861PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
862    PyObject *obj,              /* Object */
863    const char *encoding,       /* encoding */
864    const char *errors          /* error handling */
865    );
866
867/* Coerce obj to a Unicode object and return a reference with
868   *incremented* refcount.
869
870   Unicode objects are passed back as-is (subclasses are converted to
871   true Unicode objects), all other objects are delegated to
872   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
873   using UTF-8 encoding as basis for decoding the object.
874
875   The API returns NULL in case of an error. The caller is responsible
876   for decref'ing the returned objects.
877
878*/
879
880PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
881    PyObject *obj               /* Object */
882    );
883
/* Build an object from the format string and the arguments captured in
   vargs. NOTE(review): the supported conversion specifiers are defined by the
   implementation and are not visible in this header. */
884PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
885    const char *format,   /* ASCII-encoded string  */
886    va_list vargs
887    );
/* Variadic form: the arguments are passed directly after format instead of
   through a va_list. NOTE(review): presumably otherwise identical to
   PyUnicode_FromFormatV() -- confirm. */
888PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
889    const char *format,   /* ASCII-encoded string  */
890    ...
891    );
892
893#ifndef Py_LIMITED_API
/* State of an incremental string writer, shared by the _PyUnicodeWriter_*
   functions and the _PyUnicodeWriter_Prepare() macro. */
894typedef struct {
895    PyObject *buffer;           /* buffer object (a shared string when readonly is 1) */
896    void *data;                 /* NOTE(review): presumably buffer's character storage -- confirm */
897    enum PyUnicode_Kind kind;   /* NOTE(review): presumably the storage kind of data -- confirm */
898    Py_UCS4 maxchar;            /* compared against MAXCHAR by _PyUnicodeWriter_Prepare() */
899    Py_ssize_t size;            /* size - pos is the room tested by _PyUnicodeWriter_Prepare() */
900    Py_ssize_t pos;             /* NOTE(review): presumably the current write position -- confirm */
901    /* minimum length of the buffer when overallocation is enabled,
902       see _PyUnicodeWriter_Init() */
903    Py_ssize_t min_length;
904    unsigned char overallocate; /* NOTE(review): presumably non-zero when overallocation is on -- confirm */
905    /* If readonly is 1, buffer is a shared string (cannot be modified)
906       and size is set to 0. */
907    unsigned char readonly;
908} _PyUnicodeWriter ;
909
910/* Initialize a Unicode writer.
911
912   If min_length is greater than zero, _PyUnicodeWriter_Prepare()
913   overallocates the buffer and min_length is the minimum length in characters
914   of the buffer.

   writer is the caller-provided structure to initialize; nothing is
   returned. */
915PyAPI_FUNC(void)
916_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length);
917
918/* Prepare the buffer to write 'length' characters
919   with the specified maximum character.
920
921   Return 0 on success, raise an exception and return -1 on error.

   The macro evaluates to 0 without any call when MAXCHAR fits in
   WRITER->maxchar and LENGTH fits in WRITER->size - WRITER->pos, or when
   LENGTH is 0; otherwise it evaluates to the result of
   _PyUnicodeWriter_PrepareInternal().

   WRITER, LENGTH and MAXCHAR each appear more than once in the expansion, so
   the arguments must be free of side effects. */
922#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
923    (((MAXCHAR) <= (WRITER)->maxchar                                  \
924      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
925     ? 0                                                              \
926     : (((LENGTH) == 0)                                               \
927        ? 0                                                           \
928        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
929
930/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
931   instead. That macro evaluates to this function's return value whenever its
   own inline checks do not already succeed. */
932PyAPI_FUNC(int)
933_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
934                                 Py_ssize_t length, Py_UCS4 maxchar);
935
/* NOTE(review): presumably appends the string str to the writer and follows
   the 0 / -1-with-exception convention of _PyUnicodeWriter_Prepare() --
   confirm against the implementation. */
936PyAPI_FUNC(int)
937_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str);
938
/* NOTE(review): presumably finishes writing and hands back the resulting
   string (NULL on error); the writer presumably must not be reused
   afterwards -- confirm against the implementation. */
939PyAPI_FUNC(PyObject *)
940_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
941
/* NOTE(review): presumably releases what the writer holds, for paths where
   _PyUnicodeWriter_Finish() is not called -- confirm against the
   implementation. */
942PyAPI_FUNC(void)
943_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
944#endif
945
946#ifndef Py_LIMITED_API
947/* Format the object based on the format_spec, as defined in PEP 3101
948   (Advanced String Formatting).
   NOTE(review): the output presumably goes through writer, start and end
   presumably delimit the part of format_spec to use, and the int result is
   presumably 0 on success / -1 on error -- confirm. */
949PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
950    _PyUnicodeWriter *writer,
951    PyObject *obj,
952    PyObject *format_spec,
953    Py_ssize_t start,
954    Py_ssize_t end);
955#endif
956
/* Takes the address of a reference. NOTE(review): presumably the pointed-to
   reference may be replaced by the interned string -- confirm. */
957PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
/* Same calling shape as PyUnicode_InternInPlace(). NOTE(review): judging by
   the name, the interned string is additionally made immortal -- confirm. */
958PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
/* Takes a UTF-8 encoded C string and returns an object.
   NOTE(review): judging by the name the result is an interned string --
   confirm. */
959PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
960    const char *u              /* UTF-8 encoded string */
961    );
962#ifndef Py_LIMITED_API
/* Hidden when Py_LIMITED_API is defined. NOTE(review): presumably releases
   the interned strings held by the interpreter -- confirm. */
963PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
964#endif
965
966/* Use only if you know it's a string: op is cast to PyASCIIObject* without
   any type check and its state.interned field is read. */
967#define PyUnicode_CHECK_INTERNED(op) \
968    (((PyASCIIObject *)(op))->state.interned)
969
970/* --- wchar_t support for platforms which support it --------------------- */
971
972#ifdef HAVE_WCHAR_H
973
974/* Create a Unicode Object from the wchar_t buffer w of the given
975   size.
976
977   The buffer is copied into the new object. */
978
979PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
980    const wchar_t *w,           /* wchar_t buffer */
981    Py_ssize_t size             /* size of buffer */
982    );
983
984/* Copies the Unicode Object contents into the wchar_t buffer w.  At
985   most size wchar_t characters are copied.
986
987   Note that the resulting wchar_t string may or may not be
988   0-terminated.  It is the responsibility of the caller to make sure
989   that the wchar_t string is 0-terminated in case this is required by
990   the application.
991
992   Returns the number of wchar_t characters copied (excluding a
993   possibly trailing 0-termination character) or -1 in case of an
994   error. */
995
996PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
997    PyObject *unicode,          /* Unicode object */
998    wchar_t *w,                 /* wchar_t buffer */
999    Py_ssize_t size             /* size of buffer */
1000    );
1001
1002/* Convert the Unicode object to a wide character string. The output string
1003   always ends with a nul character. If size is not NULL, write the number of
1004   wide characters (excluding the null character) into *size.
1005
1006   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
1007   on success. On error, returns NULL and raises a MemoryError; *size is
1008   undefined in that case. */
1009
1010PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
1011    PyObject *unicode,          /* Unicode object */
1012    Py_ssize_t *size            /* number of characters of the result */
1013    );
1014
1015#ifndef Py_LIMITED_API
/* NOTE(review): presumably returns the characters of s converted to the
   storage width named by kind, in memory the caller must free -- confirm.
   NOTE(review): this declaration sits inside the HAVE_WCHAR_H section
   although its signature involves no wchar_t -- confirm that is intended. */
1016PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
1017#endif
1018
1019#endif
1020
1021/* --- Unicode ordinals --------------------------------------------------- */
1022
1023/* Create a Unicode Object from the given Unicode code point ordinal.
1024
1025   The ordinal must be in range(0x110000), i.e. 0 <= ordinal <= 0x10FFFF.
1026   A ValueError is raised in case it is not.
   NOTE(review): NULL is presumably returned in that case -- confirm.
1027
1028*/
1029
1030PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
1031
1032/* --- Free-list management ----------------------------------------------- */
1033
1034/* Clear the free list used by the Unicode implementation.
1035
1036   This can be used to release memory used for objects on the free
1037   list back to the Python memory allocator.
1038
   NOTE(review): the meaning of the int result is not documented here;
   presumably the number of freed entries -- confirm.
1039*/
1040
1041PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
1042
1043/* === Builtin Codecs =====================================================
1044
1045   Many of these APIs take two arguments encoding and errors. These
1046   parameters encoding and errors have the same semantics as the ones
1047   of the builtin str() API.
1048
1049   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
1050
1051   Error handling is set by errors which may also be set to NULL
1052   meaning to use the default handling defined for the codec. Default
1053   error handling for all builtin codecs is "strict" (ValueErrors are
1054   raised).
1055
1056   The codecs all use a similar interface. Only deviation from the
1057   generic ones are documented.
1058
1059*/
1060
1061/* --- Manage the default encoding ---------------------------------------- */
1062
1063/* Returns a pointer to the default encoding (UTF-8) of the
1064   Unicode object unicode and the size of the encoded representation
1065   in bytes stored in *size.
1066
1067   In case of an error, no *size is set.
1068
1069   This function caches the UTF-8 encoded string in the unicodeobject
1070   and subsequent calls will return the same string.  The memory is released
1071   when the unicodeobject is deallocated.
1072
1073   _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
1074   support the previous internal function with the same behaviour.
1075
1076   *** This API is for interpreter INTERNAL USE ONLY and will likely
1077   *** be removed or changed in the future.
1078
1079   *** If you need to access the Unicode object as UTF-8 bytes string,
1080   *** please use PyUnicode_AsUTF8String() instead.
1081*/
1082
1083#ifndef Py_LIMITED_API
1084PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
1085    PyObject *unicode,          /* Unicode object */
1086    Py_ssize_t *size);          /* receives the encoded size in bytes */
/* Alias kept for code written against the previous internal name. */
1087#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
1088#endif
1089
1090/* Returns a pointer to the default encoding (UTF-8) of the
1091   Unicode object unicode.
1092
1093   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1094   in the unicodeobject.
1095
1096   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1097   support the previous internal function with the same behaviour.
1098
1099   Use of this API is DEPRECATED since no size information can be
1100   extracted from the returned data.
1101
1102   *** This API is for interpreter INTERNAL USE ONLY and will likely
1103   *** be removed or changed for Python 3.1.
1104
1105   *** If you need to access the Unicode object as UTF-8 bytes string,
1106   *** please use PyUnicode_AsUTF8String() instead.
1107
1108*/
1109
1110#ifndef Py_LIMITED_API
/* Hidden when Py_LIMITED_API is defined. */
1111PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
/* Alias kept for code written against the previous internal name. */
1112#define _PyUnicode_AsString PyUnicode_AsUTF8
1113#endif
1114
1115/* Returns "utf-8" as a C string.
   NOTE(review): the pointer is const-qualified; presumably it refers to
   static storage that the caller must not free -- confirm. */
1116
1117PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
1118
1119/* --- Generic Codecs ----------------------------------------------------- */
1120
1121/* Create a Unicode object by decoding the encoded string s of the
1122   given size. */
1123
1124PyAPI_FUNC(PyObject*) PyUnicode_Decode(
1125    const char *s,              /* encoded string */
1126    Py_ssize_t size,            /* size of buffer */
1127    const char *encoding,       /* encoding; NULL selects UTF-8 */
1128    const char *errors          /* error handling; NULL selects the codec default */
1129    );
1130
1131/* Decode a Unicode object unicode and return the result as Python
1132   object. */
1133
1134PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
1135    PyObject *unicode,          /* Unicode object */
1136    const char *encoding,       /* encoding; NULL selects UTF-8 */
1137    const char *errors          /* error handling; NULL selects the codec default */
1138    );
1139
1140/* Decode a Unicode object unicode and return the result as Unicode
1141   object. */
1142
1143PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
1144    PyObject *unicode,          /* Unicode object */
1145    const char *encoding,       /* encoding; NULL selects UTF-8 */
1146    const char *errors          /* error handling; NULL selects the codec default */
1147    );
1148
1149/* Encodes a Py_UNICODE buffer of the given size and returns a
1150   Python string object. */
1151
1152#ifndef Py_LIMITED_API
/* Hidden when Py_LIMITED_API is defined. */
1153PyAPI_FUNC(PyObject*) PyUnicode_Encode(
1154    const Py_UNICODE *s,        /* Unicode char buffer */
1155    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
1156    const char *encoding,       /* encoding; NULL selects UTF-8 */
1157    const char *errors          /* error handling; NULL selects the codec default */
1158    );
1159#endif
1160
1161/* Encodes a Unicode object and returns the result as Python
1162   object. */
1163
1164PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
1165    PyObject *unicode,          /* Unicode object */
1166    const char *encoding,       /* encoding; NULL selects UTF-8 */
1167    const char *errors          /* error handling; NULL selects the codec default */
1168    );
1169
1170/* Encodes a Unicode object and returns the result as Python string
1171   object. */
1172
1173PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
1174    PyObject *unicode,          /* Unicode object */
1175    const char *encoding,       /* encoding; NULL selects UTF-8 */
1176    const char *errors          /* error handling; NULL selects the codec default */
1177    );
1178
1179/* Encodes a Unicode object and returns the result as Unicode
1180   object. */
1181
1182PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
1183    PyObject *unicode,          /* Unicode object */
1184    const char *encoding,       /* encoding; NULL selects UTF-8 */
1185    const char *errors          /* error handling; NULL selects the codec default */
1186    );
1187
1188/* Build an encoding map from a 256 character map.
   NOTE(review): the result is presumably meant to be passed as the mapping
   argument of the charmap encoder -- confirm. */
1189
1190PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1191    PyObject* string            /* 256 character map */
1192   );
1193
1194/* --- UTF-7 Codecs ------------------------------------------------------- */
1195
/* Decode length bytes of the UTF-7 encoded buffer string. */
1196PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
1197    const char *string,         /* UTF-7 encoded string */
1198    Py_ssize_t length,          /* size of string */
1199    const char *errors          /* error handling; NULL selects the codec default */
1200    );
1201
/* Variant of PyUnicode_DecodeUTF7() with an extra consumed out-parameter.
   NOTE(review): presumably a non-NULL consumed makes a truncated trailing
   sequence a non-error and receives the number of bytes decoded --
   confirm. */
1202PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
1203    const char *string,         /* UTF-7 encoded string */
1204    Py_ssize_t length,          /* size of string */
1205    const char *errors,         /* error handling */
1206    Py_ssize_t *consumed        /* bytes consumed */
1207    );
1208
1209#ifndef Py_LIMITED_API
/* Encode a Py_UNICODE buffer as UTF-7. Hidden when Py_LIMITED_API is
   defined. */
1210PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
1211    const Py_UNICODE *data,     /* Unicode char buffer */
1212    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1213    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1214    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1215    const char *errors          /* error handling */
1216    );
/* Same encoder options, but taking a Unicode object instead of a raw
   Py_UNICODE buffer. */
1217PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1218    PyObject *unicode,          /* Unicode object */
1219    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1220    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1221    const char *errors          /* error handling */
1222    );
1223#endif
1224
1225/* --- UTF-8 Codecs ------------------------------------------------------- */
1226
/* Decode length bytes of the UTF-8 encoded buffer string. */
1227PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
1228    const char *string,         /* UTF-8 encoded string */
1229    Py_ssize_t length,          /* size of string */
1230    const char *errors          /* error handling; NULL selects the codec default */
1231    );
1232
/* Variant of PyUnicode_DecodeUTF8() with an extra consumed out-parameter.
   NOTE(review): presumably a non-NULL consumed makes a truncated trailing
   sequence a non-error and receives the number of bytes decoded --
   confirm. */
1233PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
1234    const char *string,         /* UTF-8 encoded string */
1235    Py_ssize_t length,          /* size of string */
1236    const char *errors,         /* error handling */
1237    Py_ssize_t *consumed        /* bytes consumed */
1238    );
1239
/* Encode the Unicode object using UTF-8.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set -- confirm. */
1240PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
1241    PyObject *unicode           /* Unicode object */
1242    );
1243
1244#ifndef Py_LIMITED_API
/* Like PyUnicode_AsUTF8String(), with an explicit errors argument. Hidden
   when Py_LIMITED_API is defined. */
1245PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1246    PyObject *unicode,          /* Unicode object */
1247    const char *errors);        /* error handling */
1248
/* Encode a Py_UNICODE buffer as UTF-8. */
1249PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
1250    const Py_UNICODE *data,     /* Unicode char buffer */
1251    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1252    const char *errors          /* error handling */
1253    );
1254#endif
1255
1256/* --- UTF-32 Codecs ------------------------------------------------------ */
1257
1258/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1259   the corresponding Unicode object.
1260
1261   errors (if non-NULL) defines the error handling. It defaults
1262   to "strict".
1263
1264   If byteorder is non-NULL, the decoder starts decoding using the
1265   given byte order:
1266
1267    *byteorder == -1: little endian
1268    *byteorder == 0:  native order
1269    *byteorder == 1:  big endian
1270
1271   In native mode, the first four bytes of the stream are checked for a
1272   BOM mark. If found, the BOM mark is analysed, the byte order
1273   adjusted and the BOM skipped.  In the other modes, no BOM mark
1274   interpretation is done. After completion, *byteorder is set to the
1275   current byte order at the end of input data.
1276
1277   If byteorder is NULL, the codec starts in native order mode.
1278
1279*/
1280
1281PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
1282    const char *string,         /* UTF-32 encoded string */
1283    Py_ssize_t length,          /* size of string, in bytes */
1284    const char *errors,         /* error handling; NULL means "strict" */
1285    int *byteorder              /* pointer to byteorder to use
1286                                   0=native;-1=LE,1=BE; updated on
1287                                   exit; may be NULL (native order) */
1288    );
1289
/* Variant of PyUnicode_DecodeUTF32() with an extra consumed out-parameter.
   NOTE(review): presumably a non-NULL consumed makes a truncated trailing
   sequence a non-error and receives the number of bytes decoded --
   confirm. */
1290PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
1291    const char *string,         /* UTF-32 encoded string */
1292    Py_ssize_t length,          /* size of string */
1293    const char *errors,         /* error handling */
1294    int *byteorder,             /* pointer to byteorder to use
1295                                   0=native;-1=LE,1=BE; updated on
1296                                   exit */
1297    Py_ssize_t *consumed        /* bytes consumed */
1298    );
1299
1300/* Returns a Python string using the UTF-32 encoding in native byte
1301   order. The string always starts with a BOM mark.
   NOTE(review): "Python string" presumably means a bytes object here --
   confirm. */
1302
1303PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
1304    PyObject *unicode           /* Unicode object */
1305    );
1306
1307/* Returns a Python string object holding the UTF-32 encoded value of
1308   the Unicode data.
1309
1310   The byte order of the output is selected by byteorder as
1311   follows:
1312
1313   byteorder == -1: little endian
1314   byteorder == 0:  native byte order (writes a BOM mark)
1315   byteorder == 1:  big endian
1316
1317   If byteorder is 0, the output string will always start with the
1318   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1319   prepended.
1320
1321*/
1322
1323#ifndef Py_LIMITED_API
/* Encode a Py_UNICODE buffer as UTF-32. Hidden when Py_LIMITED_API is
   defined. */
1324PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
1325    const Py_UNICODE *data,     /* Unicode char buffer */
1326    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1327    const char *errors,         /* error handling */
1328    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1329    );
/* Same byteorder convention, but taking a Unicode object instead of a raw
   Py_UNICODE buffer. */
1330PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1331    PyObject *object,           /* Unicode object */
1332    const char *errors,         /* error handling */
1333    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1334    );
1335#endif
1336
1337/* --- UTF-16 Codecs ------------------------------------------------------ */
1338
1339/* Decodes length bytes from a UTF-16 encoded buffer string and returns
1340   the corresponding Unicode object.
1341
1342   errors (if non-NULL) defines the error handling. It defaults
1343   to "strict".
1344
1345   If byteorder is non-NULL, the decoder starts decoding using the
1346   given byte order:
1347
1348    *byteorder == -1: little endian
1349    *byteorder == 0:  native order
1350    *byteorder == 1:  big endian
1351
1352   In native mode, the first two bytes of the stream are checked for a
1353   BOM mark. If found, the BOM mark is analysed, the byte order
1354   adjusted and the BOM skipped.  In the other modes, no BOM mark
1355   interpretation is done. After completion, *byteorder is set to the
1356   current byte order at the end of input data.
1357
1358   If byteorder is NULL, the codec starts in native order mode.
1359
1360*/
1361
1362PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
1363    const char *string,         /* UTF-16 encoded string */
1364    Py_ssize_t length,          /* size of string, in bytes */
1365    const char *errors,         /* error handling; NULL means "strict" */
1366    int *byteorder              /* pointer to byteorder to use
1367                                   0=native;-1=LE,1=BE; updated on
1368                                   exit; may be NULL (native order) */
1369    );
1370
/* Variant of PyUnicode_DecodeUTF16() with an extra consumed out-parameter.
   NOTE(review): presumably a non-NULL consumed makes a truncated trailing
   sequence a non-error and receives the number of bytes decoded --
   confirm. */
1371PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
1372    const char *string,         /* UTF-16 encoded string */
1373    Py_ssize_t length,          /* size of string */
1374    const char *errors,         /* error handling */
1375    int *byteorder,             /* pointer to byteorder to use
1376                                   0=native;-1=LE,1=BE; updated on
1377                                   exit */
1378    Py_ssize_t *consumed        /* bytes consumed */
1379    );
1380
1381/* Returns a Python string using the UTF-16 encoding in native byte
1382   order. The string always starts with a BOM mark.
   NOTE(review): "Python string" presumably means a bytes object here --
   confirm. */
1383
1384PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
1385    PyObject *unicode           /* Unicode object */
1386    );
1387
1388/* Returns a Python string object holding the UTF-16 encoded value of
1389   the Unicode data.
1390
1391   The byte order of the output is selected by byteorder as
1392   follows:
1393
1394   byteorder == -1: little endian
1395   byteorder == 0:  native byte order (writes a BOM mark)
1396   byteorder == 1:  big endian
1397
1398   If byteorder is 0, the output string will always start with the
1399   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1400   prepended.
1401
1402   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1403   UCS-2. This trick makes it possible to add full UTF-16 capabilities
1404   at a later point without compromising the APIs.
1405
1406*/
1407
1408#ifndef Py_LIMITED_API
/* Encode a Py_UNICODE buffer as UTF-16. Hidden when Py_LIMITED_API is
   defined. */
1409PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
1410    const Py_UNICODE *data,     /* Unicode char buffer */
1411    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1412    const char *errors,         /* error handling */
1413    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1414    );
/* Same byteorder convention, but taking a Unicode object instead of a raw
   Py_UNICODE buffer. */
1415PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1416    PyObject* unicode,          /* Unicode object */
1417    const char *errors,         /* error handling */
1418    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1419    );
1420#endif
1421
1422/* --- Unicode-Escape Codecs ---------------------------------------------- */
1423
/* Decode length bytes of the Unicode-Escape encoded buffer string. */
1424PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
1425    const char *string,         /* Unicode-Escape encoded string */
1426    Py_ssize_t length,          /* size of string */
1427    const char *errors          /* error handling; NULL selects the codec default */
1428    );
1429
/* Encode the Unicode object using the Unicode-Escape codec.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set -- confirm. */
1430PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
1431    PyObject *unicode           /* Unicode object */
1432    );
1433
1434#ifndef Py_LIMITED_API
/* Encode a Py_UNICODE buffer using the Unicode-Escape codec. Takes no errors
   argument. Hidden when Py_LIMITED_API is defined. */
1435PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
1436    const Py_UNICODE *data,     /* Unicode char buffer */
1437    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1438    );
1439#endif
1440
1441/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1442
/* Decode length bytes of the Raw-Unicode-Escape encoded buffer string. */
1443PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
1444    const char *string,         /* Raw-Unicode-Escape encoded string */
1445    Py_ssize_t length,          /* size of string */
1446    const char *errors          /* error handling; NULL selects the codec default */
1447    );
1448
/* Encode the Unicode object using the Raw-Unicode-Escape codec.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set -- confirm. */
1449PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
1450    PyObject *unicode           /* Unicode object */
1451    );
1452
1453#ifndef Py_LIMITED_API
/* Encode a Py_UNICODE buffer using the Raw-Unicode-Escape codec. Takes no
   errors argument. Hidden when Py_LIMITED_API is defined. */
1454PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
1455    const Py_UNICODE *data,     /* Unicode char buffer */
1456    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1457    );
1458#endif
1459
1460/* --- Unicode Internal Codec ---------------------------------------------
1461
1462    Only for internal use in _codecsmodule.c */
1463
1464#ifndef Py_LIMITED_API
/* NOTE(review): unlike the other declarations in this header, this one is
   not wrapped in PyAPI_FUNC() -- confirm that omitting the export decoration
   is intentional for this internal-only function. */
1465PyObject *_PyUnicode_DecodeUnicodeInternal(
1466    const char *string,         /* encoded string */
1467    Py_ssize_t length,          /* size of string */
1468    const char *errors          /* error handling */
1469    );
1470#endif
1471
1472/* --- Latin-1 Codecs -----------------------------------------------------
1473
1474   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1475
1476*/
1477
/* Decode length bytes of the Latin-1 encoded buffer string. */
1478PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1479    const char *string,         /* Latin-1 encoded string */
1480    Py_ssize_t length,          /* size of string */
1481    const char *errors          /* error handling; NULL selects the codec default */
1482    );
1483
/* Encode the Unicode object using Latin-1.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set -- confirm. */
1484PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1485    PyObject *unicode           /* Unicode object */
1486    );
1487
1488#ifndef Py_LIMITED_API
/* Like PyUnicode_AsLatin1String(), with an explicit errors argument. Hidden
   when Py_LIMITED_API is defined. */
1489PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1490    PyObject* unicode,          /* Unicode object */
1491    const char* errors);        /* error handling */
1492
/* Encode a Py_UNICODE buffer as Latin-1. */
1493PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1494    const Py_UNICODE *data,     /* Unicode char buffer */
1495    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1496    const char *errors          /* error handling */
1497    );
1498#endif
1499
1500/* --- ASCII Codecs -------------------------------------------------------
1501
1502   Only 7-bit ASCII data is accepted. All other codes generate errors.
1503
1504*/
1505
/* Decode length bytes of the ASCII encoded buffer string. */
1506PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1507    const char *string,         /* ASCII encoded string */
1508    Py_ssize_t length,          /* size of string */
1509    const char *errors          /* error handling; NULL selects the codec default */
1510    );
1511
/* Encode the Unicode object using ASCII.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set -- confirm. */
1512PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1513    PyObject *unicode           /* Unicode object */
1514    );
1515
1516#ifndef Py_LIMITED_API
/* Like PyUnicode_AsASCIIString(), with an explicit errors argument. Hidden
   when Py_LIMITED_API is defined. */
1517PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1518    PyObject* unicode,          /* Unicode object */
1519    const char* errors);        /* error handling */
1520
/* Encode a Py_UNICODE buffer as ASCII. */
1521PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1522    const Py_UNICODE *data,     /* Unicode char buffer */
1523    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1524    const char *errors          /* error handling */
1525    );
1526#endif
1527
1528/* --- Character Map Codecs -----------------------------------------------
1529
1530   This codec uses mappings to encode and decode characters.
1531
1532   Decoding mappings must map single string characters to single
1533   Unicode characters, integers (which are then interpreted as Unicode
1534   ordinals) or None (meaning "undefined mapping" and causing an
1535   error).
1536
1537   Encoding mappings must map single Unicode characters to single
1538   string characters, integers (which are then interpreted as Latin-1
1539   ordinals) or None (meaning "undefined mapping" and causing an
1540   error).
1541
1542   If a character lookup fails with a LookupError, the character is
1543   copied as-is meaning that its ordinal value will be interpreted as
1544   Unicode or Latin-1 ordinal resp. Because of this mappings only need
1545   to contain those mappings which map characters to different code
1546   points.
1547
1548*/
1549
/* Decode length bytes of the buffer string through the given decoding
   mapping. */
1550PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1551    const char *string,         /* Encoded string */
1552    Py_ssize_t length,          /* size of string */
1553    PyObject *mapping,          /* character mapping
1554                                   (char ordinal -> unicode ordinal) */
1555    const char *errors          /* error handling; NULL selects the codec default */
1556    );
1557
/* Encode the Unicode object through the given encoding mapping.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set -- confirm. */
1558PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1559    PyObject *unicode,          /* Unicode object */
1560    PyObject *mapping           /* character mapping
1561                                   (unicode ordinal -> char ordinal) */
1562    );
1563
1564#ifndef Py_LIMITED_API
/* Encode a Py_UNICODE buffer through the given encoding mapping. Hidden when
   Py_LIMITED_API is defined. */
1565PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1566    const Py_UNICODE *data,     /* Unicode char buffer */
1567    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1568    PyObject *mapping,          /* character mapping
1569                                   (unicode ordinal -> char ordinal) */
1570    const char *errors          /* error handling */
1571    );
/* Same mapping and errors arguments, but taking a Unicode object instead of
   a raw Py_UNICODE buffer. */
1572PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1573    PyObject *unicode,          /* Unicode object */
1574    PyObject *mapping,          /* character mapping
1575                                   (unicode ordinal -> char ordinal) */
1576    const char *errors          /* error handling */
1577    );
1578#endif
1579
1580/* Translate a Py_UNICODE buffer of the given length by applying a
1581   character mapping table to it and return the resulting Unicode
1582   object.
1583
1584   The mapping table must map Unicode ordinal integers to Unicode
1585   ordinal integers or None (causing deletion of the character).
1586
1587   Mapping tables may be dictionaries or sequences. Unmapped character
1588   ordinals (ones which cause a LookupError) are left untouched and
1589   are copied as-is.
1590
1591*/
1592
1593#ifndef Py_LIMITED_API
1594PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1595    const Py_UNICODE *data,     /* Unicode char buffer */
1596    Py_ssize_t length,          /* Number of Py_UNICODE chars to translate */
1597    PyObject *table,            /* Translate table */
1598    const char *errors          /* error handling */
1599    );
1600#endif
1601
1602#ifdef HAVE_MBCS
1603
1604/* --- MBCS codecs for Windows -------------------------------------------- */
1605
/* Decode length bytes of the MBCS encoded buffer string. Only declared when
   HAVE_MBCS is defined. */
1606PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1607    const char *string,         /* MBCS encoded string */
1608    Py_ssize_t length,          /* size of string */
1609    const char *errors          /* error handling */
1610    );
1611
/* Variant of PyUnicode_DecodeMBCS() with an extra consumed out-parameter.
   NOTE(review): presumably a non-NULL consumed makes a truncated trailing
   sequence a non-error and receives the number of bytes decoded --
   confirm. */
1612PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1613    const char *string,         /* MBCS encoded string */
1614    Py_ssize_t length,          /* size of string */
1615    const char *errors,         /* error handling */
1616    Py_ssize_t *consumed        /* bytes consumed */
1617    );
1618
/* Decode length bytes of string using the code page numbered code_page.
   NOTE(review): consumed presumably behaves as in
   PyUnicode_DecodeMBCSStateful() -- confirm. */
1619PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1620    int code_page,              /* code page number */
1621    const char *string,         /* encoded string */
1622    Py_ssize_t length,          /* size of string */
1623    const char *errors,         /* error handling */
1624    Py_ssize_t *consumed        /* bytes consumed */
1625    );
1626
/* Encode the Unicode object using MBCS.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set -- confirm. */
1627PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1628    PyObject *unicode           /* Unicode object */
1629    );
1630
1631#ifndef Py_LIMITED_API
/* Encode a Py_UNICODE buffer as MBCS. Hidden when Py_LIMITED_API is
   defined. */
1632PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1633    const Py_UNICODE *data,     /* Unicode char buffer */
1634    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1635    const char *errors          /* error handling */
1636    );
1637#endif
1638
1639PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1640    int code_page,              /* code page number */
1641    PyObject *unicode,          /* Unicode object */
1642    const char *errors          /* error handling */
1643    );
1644
1645#endif /* HAVE_MBCS */
1646
1647/* --- Decimal Encoder ---------------------------------------------------- */
1648
1649/* Takes a Unicode string holding a decimal value and writes it into
1650   an output buffer using standard ASCII digit codes.
1651
1652   The output buffer has to provide at least length+1 bytes of storage
1653   area. The output string is 0-terminated.
1654
1655   The encoder converts whitespace to ' ', decimal characters to their
1656   corresponding ASCII digit and all other Latin-1 characters except
1657   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1658   are treated as errors. This includes embedded NUL characters.
1659
1660   Error handling is defined by the errors argument:
1661
1662      NULL or "strict": raise a ValueError
1663      "ignore": ignore the wrong characters (these are not copied to the
1664                output buffer)
1665      "replace": replaces illegal characters with '?'
1666
1667   Returns 0 on success, -1 on failure.
1668
1669*/
1670
1671#ifndef Py_LIMITED_API
1672PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1673    Py_UNICODE *s,              /* Unicode buffer */
1674    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1675    char *output,               /* Output buffer; must have size >= length+1 */
1676    const char *errors          /* error handling: NULL, "strict", "ignore" or "replace" */
1677    );
1678#endif
1679
1680/* Transforms code points that have decimal digit property to the
1681   corresponding ASCII digit code points.
1682
1683   Returns a new Unicode string on success, NULL on failure.
1684*/
1685
1686#ifndef Py_LIMITED_API
1687PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1688    Py_UNICODE *s,              /* Unicode buffer */
1689    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
1690    );
1691#endif
1692
1693/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
1694   as argument instead of a raw buffer and length.  This function additionally
1695   transforms spaces to ASCII because this is what the callers in longobject,
1696   floatobject, and complexobject did anyways. */
1697
1698#ifndef Py_LIMITED_API
1699PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1700    PyObject *unicode           /* Unicode object */
1701    );
1702#endif
1703
1704/* --- Locale encoding --------------------------------------------------- */
1705
1706/* Decode a string from the current locale encoding. The decoder is strict if
1707   *errors* is NULL or "strict"; if *errors* is "surrogateescape", it uses the
1708   'surrogateescape' error handler (PEP 383) to escape undecodable bytes. If a
1709   byte sequence can be decoded as a surrogate character and the
1710   'surrogateescape' error handler is in use, the byte sequence is escaped
1711   instead of being decoded. *str* must end with a null character but cannot
1712   contain embedded null characters. */
1713
1714PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1715    const char *str,
1716    Py_ssize_t len,
1717    const char *errors);
1718
1719/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1720   length using strlen(). */
1721
1722PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1723    const char *str,
1724    const char *errors);
1725
1726/* Encode a Unicode object to the current locale encoding. The encoder is
1727   strict if *errors* is NULL or "strict"; if *errors* is "surrogateescape",
1728   the "surrogateescape" error handler is used. Return a bytes object. The
1729   string cannot contain embedded null characters. */
1730
1731PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1732    PyObject *unicode,
1733    const char *errors
1734    );
1735
1736/* --- File system encoding ---------------------------------------------- */
1737
1738/* ParseTuple converter: encode str objects to bytes using
1739   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1740
1741PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1742
1743/* ParseTuple converter: decode bytes objects to unicode using
1744   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1745
1746PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1747
1748/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1749   and the "surrogateescape" error handler.
1750
1751   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1752   encoding.
1753
1754   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1755*/
1756
1757PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1758    const char *s               /* encoded string */
1759    );
1760
1761/* Decode a string using Py_FileSystemDefaultEncoding
1762   and the "surrogateescape" error handler.
1763
1764   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1765   encoding.
1766*/
1767
1768PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1769    const char *s,               /* encoded string */
1770    Py_ssize_t size              /* size */
1771    );
1772
1773/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1774   "surrogateescape" error handler, and return bytes.
1775
1776   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1777   encoding.
1778*/
1779
1780PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1781    PyObject *unicode
1782    );
1783
1784/* --- Methods & Slots ----------------------------------------------------
1785
1786   These are capable of handling Unicode objects and strings on input
1787   (we refer to them as strings in the descriptions) and return
1788   Unicode objects or integers as appropriate. */
1789
1790/* Concat two strings giving a new Unicode string. */
1791
1792PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1793    PyObject *left,             /* Left string */
1794    PyObject *right             /* Right string */
1795    );
1796
1797/* Concat two strings and put the result in *pleft
1798   (sets *pleft to NULL on error) */
1799
1800PyAPI_FUNC(void) PyUnicode_Append(
1801    PyObject **pleft,           /* Pointer to left string */
1802    PyObject *right             /* Right string */
1803    );
1804
1805/* Concat two strings, put the result in *pleft and drop the right object
1806   (sets *pleft to NULL on error) */
1807
1808PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1809    PyObject **pleft,           /* Pointer to left string */
1810    PyObject *right             /* Right string */
1811    );
1812
1813/* Split a string giving a list of Unicode strings.
1814
1815   If sep is NULL, splitting will be done at all whitespace
1816   substrings. Otherwise, splits occur at the given separator.
1817
1818   At most maxsplit splits will be done. If negative, no limit is set.
1819
1820   Separators are not included in the resulting list.
1821
1822*/
1823
1824PyAPI_FUNC(PyObject*) PyUnicode_Split(
1825    PyObject *s,                /* String to split */
1826    PyObject *sep,              /* String separator */
1827    Py_ssize_t maxsplit         /* Maxsplit count */
1828    );
1829
1830/* Ditto, but split at line breaks.
1831
1832   CRLF is considered to be one line break. Line breaks are not
1833   included in the resulting list unless keepends is true. */
1834
1835PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1836    PyObject *s,                /* String to split */
1837    int keepends                /* If true, line end markers are included */
1838    );
1839
1840/* Partition a string using a given separator. */
1841
1842PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1843    PyObject *s,                /* String to partition */
1844    PyObject *sep               /* String separator */
1845    );
1846
1847/* Partition a string using a given separator, searching from the end of the
1848   string. */
1849
1850PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1851    PyObject *s,                /* String to partition */
1852    PyObject *sep               /* String separator */
1853    );
1854
1855/* Split a string giving a list of Unicode strings.
1856
1857   If sep is NULL, splitting will be done at all whitespace
1858   substrings. Otherwise, splits occur at the given separator.
1859
1860   At most maxsplit splits will be done; if maxsplit is negative, no
1861   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1862   the end of the string.
1863
1864   Separators are not included in the resulting list.
1865
1866*/
1867
1868PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1869    PyObject *s,                /* String to split */
1870    PyObject *sep,              /* String separator */
1871    Py_ssize_t maxsplit         /* Maxsplit count */
1872    );
1873
1874/* Translate a string by applying a character mapping table to it and
1875   return the resulting Unicode object.
1876
1877   The mapping table must map Unicode ordinal integers to Unicode
1878   ordinal integers or None (causing deletion of the character).
1879
1880   Mapping tables may be dictionaries or sequences. Unmapped character
1881   ordinals (ones which cause a LookupError) are left untouched and
1882   are copied as-is.
1883
1884*/
1885
1886PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1887    PyObject *str,              /* String */
1888    PyObject *table,            /* Translate table */
1889    const char *errors          /* error handling */
1890    );
1891
1892/* Join a sequence of strings using the given separator and return
1893   the resulting Unicode string. */
1894
1895PyAPI_FUNC(PyObject*) PyUnicode_Join(
1896    PyObject *separator,        /* Separator string */
1897    PyObject *seq               /* Sequence object */
1898    );
1899
1900/* Return 1 if substr matches str[start:end] at the given tail end, 0
1901   otherwise. Return -1 if an error occurred. */
1902
1903PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1904    PyObject *str,              /* String */
1905    PyObject *substr,           /* Prefix or Suffix string */
1906    Py_ssize_t start,           /* Start index */
1907    Py_ssize_t end,             /* Stop index */
1908    int direction               /* Tail end: -1 prefix, +1 suffix */
1909    );
1910
1911/* Return the first position of substr in str[start:end] using the
1912   given search direction or -1 if not found. -2 is returned in case
1913   an error occurred and an exception is set. */
1914
1915PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1916    PyObject *str,              /* String */
1917    PyObject *substr,           /* Substring to find */
1918    Py_ssize_t start,           /* Start index */
1919    Py_ssize_t end,             /* Stop index */
1920    int direction               /* Find direction: +1 forward, -1 backward */
1921    );
1922
1923/* Like PyUnicode_Find, but search for a single character only. */
1924PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1925    PyObject *str,              /* String */
1926    Py_UCS4 ch,                 /* Character to find */
1927    Py_ssize_t start,           /* Start index */
1928    Py_ssize_t end,             /* Stop index */
1929    int direction               /* Find direction: +1 forward, -1 backward */
1930    );
1931
1932/* Count the number of occurrences of substr in str[start:end]. */
1933
1934PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1935    PyObject *str,              /* String */
1936    PyObject *substr,           /* Substring to count */
1937    Py_ssize_t start,           /* Start index */
1938    Py_ssize_t end              /* Stop index */
1939    );
1940
1941/* Replace at most maxcount occurrences of substr in str with replstr
1942   and return the resulting Unicode object. */
1943
1944PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1945    PyObject *str,              /* String */
1946    PyObject *substr,           /* Substring to find */
1947    PyObject *replstr,          /* Substring to replace */
1948    Py_ssize_t maxcount         /* Max. number of replacements to apply;
1949                                   -1 = all */
1950    );
1951
1952/* Compare two strings and return -1, 0, 1 for less than, equal,
1953   greater than resp.
1954   Raise an exception and return -1 on error. */
1955
1956PyAPI_FUNC(int) PyUnicode_Compare(
1957    PyObject *left,             /* Left string */
1958    PyObject *right             /* Right string */
1959    );
1960
1961PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1962    PyObject *left,
1963    const char *right           /* ASCII-encoded string */
1964    );
1965
1966/* Rich compare two strings and return one of the following:
1967
1968   - NULL in case an exception was raised
1969   - Py_True or Py_False for successful comparisons
1970   - Py_NotImplemented in case the type combination is unknown
1971
1972   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1973   case the conversion of the arguments to Unicode fails with a
1974   UnicodeDecodeError.
1975
1976   Possible values for op:
1977
1978     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1979
1980*/
1981
1982PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1983    PyObject *left,             /* Left string */
1984    PyObject *right,            /* Right string */
1985    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1986    );
1987
1988/* Apply an argument tuple or dictionary to a format string and return
1989   the resulting Unicode string. */
1990
1991PyAPI_FUNC(PyObject *) PyUnicode_Format(
1992    PyObject *format,           /* Format string */
1993    PyObject *args              /* Argument tuple or dictionary */
1994    );
1995
1996/* Check whether element is contained in container and return 1/0
1997   accordingly.
1998
1999   element has to coerce to a one-element Unicode string. -1 is
2000   returned in case of an error. */
2001
2002PyAPI_FUNC(int) PyUnicode_Contains(
2003    PyObject *container,        /* Container string */
2004    PyObject *element           /* Element string */
2005    );
2006
2007/* Checks whether the string contains any NUL characters. */
2008
2009#ifndef Py_LIMITED_API
2010PyAPI_FUNC(int) _PyUnicode_HasNULChars(PyObject *);
2011#endif
2012
2013/* Checks whether argument is a valid identifier. */
2014
2015PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
2016
2017#ifndef Py_LIMITED_API
2018/* Externally visible for str.strip(unicode) */
2019PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
2020    PyObject *self,
2021    int striptype,
2022    PyObject *sepobj
2023    );
2024#endif
2025
2026/* Using explicit passed-in values, insert the thousands grouping
2027   into the string pointed to by buffer.  For the argument descriptions,
2028   see Objects/stringlib/localeutil.h */
2029#ifndef Py_LIMITED_API
2030PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
2031    PyObject *unicode,
2032    Py_ssize_t index,
2033    Py_ssize_t n_buffer,
2034    void *digits,
2035    Py_ssize_t n_digits,
2036    Py_ssize_t min_width,
2037    const char *grouping,
2038    PyObject *thousands_sep,
2039    Py_UCS4 *maxchar);
2040#endif
2041/* === Characters Type APIs =============================================== */
2042
2043/* Helper array used by Py_UNICODE_ISSPACE(). */
2044
2045#ifndef Py_LIMITED_API
2046PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
2047
2048/* These should not be used directly. Use the Py_UNICODE_IS* and
2049   Py_UNICODE_TO* macros instead.
2050
2051   These APIs are implemented in Objects/unicodectype.c.
2052
2053*/
2054
2055PyAPI_FUNC(int) _PyUnicode_IsLowercase(
2056    Py_UCS4 ch       /* Unicode character */
2057    );
2058
2059PyAPI_FUNC(int) _PyUnicode_IsUppercase(
2060    Py_UCS4 ch       /* Unicode character */
2061    );
2062
2063PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
2064    Py_UCS4 ch       /* Unicode character */
2065    );
2066
2067PyAPI_FUNC(int) _PyUnicode_IsXidStart(
2068    Py_UCS4 ch       /* Unicode character */
2069    );
2070
2071PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
2072    Py_UCS4 ch       /* Unicode character */
2073    );
2074
2075PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
2076    const Py_UCS4 ch         /* Unicode character */
2077    );
2078
2079PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
2080    const Py_UCS4 ch         /* Unicode character */
2081    );
2082
2083PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2084    Py_UCS4 ch       /* Unicode character */
2085    );
2086
2087PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2088    Py_UCS4 ch       /* Unicode character */
2089    );
2090
2091PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2092    Py_UCS4 ch       /* Unicode character */
2093    );
2094
2095PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2096    Py_UCS4 ch,       /* Unicode character */
2097    Py_UCS4 *res
2098    );
2099
2100PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2101    Py_UCS4 ch,       /* Unicode character */
2102    Py_UCS4 *res
2103    );
2104
2105PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2106    Py_UCS4 ch,       /* Unicode character */
2107    Py_UCS4 *res
2108    );
2109
2110PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2111    Py_UCS4 ch,       /* Unicode character */
2112    Py_UCS4 *res
2113    );
2114
2115PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
2116    Py_UCS4 ch         /* Unicode character */
2117    );
2118
2119PyAPI_FUNC(int) _PyUnicode_IsCased(
2120    Py_UCS4 ch         /* Unicode character */
2121    );
2122
2123PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
2124    Py_UCS4 ch       /* Unicode character */
2125    );
2126
2127PyAPI_FUNC(int) _PyUnicode_ToDigit(
2128    Py_UCS4 ch       /* Unicode character */
2129    );
2130
2131PyAPI_FUNC(double) _PyUnicode_ToNumeric(
2132    Py_UCS4 ch       /* Unicode character */
2133    );
2134
2135PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
2136    Py_UCS4 ch       /* Unicode character */
2137    );
2138
2139PyAPI_FUNC(int) _PyUnicode_IsDigit(
2140    Py_UCS4 ch       /* Unicode character */
2141    );
2142
2143PyAPI_FUNC(int) _PyUnicode_IsNumeric(
2144    Py_UCS4 ch       /* Unicode character */
2145    );
2146
2147PyAPI_FUNC(int) _PyUnicode_IsPrintable(
2148    Py_UCS4 ch       /* Unicode character */
2149    );
2150
2151PyAPI_FUNC(int) _PyUnicode_IsAlpha(
2152    Py_UCS4 ch       /* Unicode character */
2153    );
2154
2155PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2156    const Py_UNICODE *u
2157    );
2158
2159PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
2160    Py_UNICODE *s1,
2161    const Py_UNICODE *s2);
2162
2163PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2164    Py_UNICODE *s1, const Py_UNICODE *s2);
2165
2166PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
2167    Py_UNICODE *s1,
2168    const Py_UNICODE *s2,
2169    size_t n);
2170
2171PyAPI_FUNC(int) Py_UNICODE_strcmp(
2172    const Py_UNICODE *s1,
2173    const Py_UNICODE *s2
2174    );
2175
2176PyAPI_FUNC(int) Py_UNICODE_strncmp(
2177    const Py_UNICODE *s1,
2178    const Py_UNICODE *s2,
2179    size_t n
2180    );
2181
2182PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
2183    const Py_UNICODE *s,
2184    Py_UNICODE c
2185    );
2186
2187PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
2188    const Py_UNICODE *s,
2189    Py_UNICODE c
2190    );
2191
2192/* Create a copy of a unicode string ending with a nul character. Return NULL
2193   and raise a MemoryError exception on memory allocation failure, otherwise
2194   return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2195
2196PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
2197    PyObject *unicode
2198    );
2199#endif /* Py_LIMITED_API */
2200
2201#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
2202PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
2203    PyObject *op,
2204    int check_content);
2205#endif
2206
2207/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2208PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2209/* Clear all static strings. */
2210PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2211
2212#ifdef __cplusplus
2213}
2214#endif
2215#endif /* !Py_UNICODEOBJECT_H */
2216