unicodeobject.h revision b3648576cd76232e618ecc227541c7b722355f6e
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
12
13Copyright (c) Corporation for National Research Initiatives.
14
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python.  This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
32 *
33 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
35 *
36 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
39 *
40 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
48 *
49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
58#include <ctype.h>
59
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
64/* Python 3.x requires unicode */
65#define Py_USING_UNICODE
66
67#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
69#endif
70
71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74   Otherwise, Unicode strings are stored as UCS-2 (with limited support
75   for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
86/* Py_UNICODE was the native Unicode storage format (code unit) used by
87   Python and represents a single Unicode element in the Unicode type.
88   With PEP 393, Py_UNICODE is deprecated and replaced with a
89   typedef to wchar_t. */
90
91#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
94#endif
95
96/* If the compiler provides a wchar_t type we try to support it
97   through the interface functions PyUnicode_FromWideChar(),
98   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
99
100#ifdef HAVE_USABLE_WCHAR_T
101# ifndef HAVE_WCHAR_H
102#  define HAVE_WCHAR_H
103# endif
104#endif
105
106#ifdef HAVE_WCHAR_H
107/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
108# ifdef _HAVE_BSDI
109#  include <time.h>
110# endif
111#  include <wchar.h>
112#endif
113
/* Fixed-width code unit types used by the canonical representations:
   Py_UCS1, Py_UCS2 and Py_UCS4 are unsigned 8-, 16- and 32-bit integers. */
typedef uint8_t Py_UCS1;
typedef uint16_t Py_UCS2;
typedef uint32_t Py_UCS4;
119
120/* --- Internal Unicode Operations ---------------------------------------- */
121
122/* Since splitting on whitespace is an important use case, and
123   whitespace in most situations is solely ASCII whitespace, we
124   optimize for the common case by using a quick look-up table
125   _Py_ascii_whitespace (see below) with an inlined check.
126
127 */
128#ifndef Py_LIMITED_API
129#define Py_UNICODE_ISSPACE(ch) \
130    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
131
132#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
133#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
134#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
135#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
136
137#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
138#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
139#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
140
141#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
142#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
143#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
144#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
145
146#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
147#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
148#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
149
150#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
151
/* True if ch is alphabetic, or a decimal, digit or numeric character.
   The checks short-circuit, so ch may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)      \
    (Py_UNICODE_ISALPHA(ch)   ||    \
     Py_UNICODE_ISDECIMAL(ch) ||    \
     Py_UNICODE_ISDIGIT(ch)   ||    \
     Py_UNICODE_ISNUMERIC(ch))
157
/* Copy `length` Py_UNICODE code units from `source` to `target` using
   memcpy(); the expression evaluates to memcpy()'s return value. */
#define Py_UNICODE_COPY(target, source, length) \
    memcpy((target), (source), sizeof(Py_UNICODE) * (length))
160
/* Store `value` into the first `length` elements of `target`.

   Every argument is evaluated exactly once (target, value, then length),
   so expressions with side effects are safe to pass.  A zero or negative
   length stores nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length); Py_ssize_t i_;\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
165
/* Surrogate helpers.  The range checks evaluate ch twice. */
#define Py_UNICODE_IS_SURROGATE(ch)      ((ch) >= 0xD800 && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) ((ch) >= 0xD800 && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch)  ((ch) >= 0xDC00 && (ch) <= 0xDFFF)
/* Combine a high and a low surrogate into the Py_UCS4 code point they
   encode: the low 10 bits of each are concatenated and offset by 0x10000. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)           \
    (0x10000 + ((((Py_UCS4)(high) & 0x03FF) << 10) |    \
                ((Py_UCS4)(low) & 0x03FF)))
/* High surrogate of ch: 0xD7C0 is 0xD800 - (0x10000 >> 10), so adding the
   bits above the low ten maps U+10000 onto 0xD800. */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD7C0 + ((ch) >> 10))
/* Low surrogate of ch: the bottom 10 bits added to 0xDC00. */
#define Py_UNICODE_LOW_SURROGATE(ch) (((ch) & 0x3FF) + 0xDC00)
178
/* Test whether `substring` occurs in `string` starting at `offset`.

   The first and the last code unit are compared before memcmp() is run
   over the whole substring.  The offset must be valid, and the substring
   must not be empty.  All three arguments are evaluated several times.

   NOTE(review): `wstr` is read from both arguments and `wstr_length` from
   `substring` as direct members -- confirm which object type callers are
   expected to pass. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    (*((string)->wstr + (offset)) == *((substring)->wstr) && \
     *((string)->wstr + (offset) + (substring)->wstr_length-1) == \
         *((substring)->wstr + (substring)->wstr_length-1) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, \
             (substring)->wstr_length*sizeof(Py_UNICODE)))
186
187#endif /* Py_LIMITED_API */
188
189#ifdef __cplusplus
190extern "C" {
191#endif
192
193/* --- Unicode Type ------------------------------------------------------- */
194
195#ifndef Py_LIMITED_API
196
197/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
198   structure. state.ascii and state.compact are set, and the data
199   immediately follow the structure. utf8_length and wstr_length can be found
200   in the length field; the utf8 pointer is equal to the data pointer. */
201typedef struct {
202    /* There are 4 forms of Unicode strings:
203
204       - compact ascii:
205
206         * structure = PyASCIIObject
207         * test: PyUnicode_IS_COMPACT_ASCII(op)
208         * kind = PyUnicode_1BYTE_KIND
209         * compact = 1
210         * ascii = 1
211         * ready = 1
212         * (length is the length of the utf8 and wstr strings)
213         * (data starts just after the structure)
214         * (since ASCII is decoded from UTF-8, the utf8 string is the data)
215
216       - compact:
217
218         * structure = PyCompactUnicodeObject
219         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
220         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
221           PyUnicode_4BYTE_KIND
222         * compact = 1
223         * ready = 1
224         * ascii = 0
225         * utf8 is not shared with data
226         * utf8_length = 0 if utf8 is NULL
227         * wstr is shared with data and wstr_length=length
228           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
229           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
230         * wstr_length = 0 if wstr is NULL
231         * (data starts just after the structure)
232
233       - legacy string, not ready:
234
235         * structure = PyUnicodeObject
236         * test: kind == PyUnicode_WCHAR_KIND
237         * length = 0 (use wstr_length)
238         * hash = -1
239         * kind = PyUnicode_WCHAR_KIND
240         * compact = 0
241         * ascii = 0
242         * ready = 0
243         * interned = SSTATE_NOT_INTERNED
244         * wstr is not NULL
245         * data.any is NULL
246         * utf8 is NULL
247         * utf8_length = 0
248
249       - legacy string, ready:
250
251         * structure = PyUnicodeObject structure
252         * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
253         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
254           PyUnicode_4BYTE_KIND
255         * compact = 0
256         * ready = 1
257         * data.any is not NULL
258         * utf8 is shared and utf8_length = length with data.any if ascii = 1
259         * utf8_length = 0 if utf8 is NULL
260         * wstr is shared with data.any and wstr_length = length
261           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
262           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
263         * wstr_length = 0 if wstr is NULL
264
265       Compact strings use only one memory block (structure + characters),
266       whereas legacy strings use one block for the structure and one block
267       for characters.
268
269       Legacy strings are created by PyUnicode_FromUnicode() and
270       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
271       when PyUnicode_READY() is called.
272
273       See also _PyUnicode_CheckConsistency().
274    */
275    PyObject_HEAD
276    Py_ssize_t length;          /* Number of code points in the string */
277    Py_hash_t hash;             /* Hash value; -1 if not set */
278    struct {
279        /*
280           SSTATE_NOT_INTERNED (0)
281           SSTATE_INTERNED_MORTAL (1)
282           SSTATE_INTERNED_IMMORTAL (2)
283
284           If interned != SSTATE_NOT_INTERNED, the two references from the
285           dictionary to this object are *not* counted in ob_refcnt.
286         */
287        unsigned int interned:2;
288        /* Character size:
289
290           - PyUnicode_WCHAR_KIND (0):
291
292             * character type = wchar_t (16 or 32 bits, depending on the
293               platform)
294
295           - PyUnicode_1BYTE_KIND (1):
296
297             * character type = Py_UCS1 (8 bits, unsigned)
298             * all characters are in the range U+0000-U+00FF (latin1)
299             * if ascii is set, all characters are in the range U+0000-U+007F
300               (ASCII), otherwise at least one character is in the range
301               U+0080-U+00FF
302
303           - PyUnicode_2BYTE_KIND (2):
304
305             * character type = Py_UCS2 (16 bits, unsigned)
306             * all characters are in the range U+0000-U+FFFF (BMP)
307             * at least one character is in the range U+0100-U+FFFF
308
309           - PyUnicode_4BYTE_KIND (4):
310
311             * character type = Py_UCS4 (32 bits, unsigned)
312             * all characters are in the range U+0000-U+10FFFF
313             * at least one character is in the range U+10000-U+10FFFF
314         */
315        unsigned int kind:3;
316        /* Compact is with respect to the allocation scheme. Compact unicode
317           objects only require one memory block while non-compact objects use
318           one block for the PyUnicodeObject struct and another for its data
319           buffer. */
320        unsigned int compact:1;
321        /* The string only contains characters in the range U+0000-U+007F (ASCII)
322           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
323           set, use the PyASCIIObject structure. */
324        unsigned int ascii:1;
325        /* The ready flag indicates whether the object layout is initialized
326           completely. This means that this is either a compact object, or
327           the data pointer is filled out. The bit is redundant, and helps
328           to minimize the test in PyUnicode_IS_READY(). */
329        unsigned int ready:1;
330        /* Padding to ensure that PyUnicode_DATA() is always aligned to
331           4 bytes (see issue #19537 on m68k). */
332        unsigned int :24;
333    } state;
334    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
335} PyASCIIObject;
336
337/* Non-ASCII strings allocated through PyUnicode_New use the
338   PyCompactUnicodeObject structure. state.compact is set, and the data
339   immediately follow the structure. */
340typedef struct {
341    PyASCIIObject _base;
342    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
343                                 * terminating \0. */
344    char *utf8;                 /* UTF-8 representation (null-terminated) */
345    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
346                                 * surrogates count as two code points. */
347} PyCompactUnicodeObject;
348
349/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
350   PyUnicodeObject structure. The actual string data is initially in the wstr
351   block, and copied into the data block using _PyUnicode_Ready. */
352typedef struct {
353    PyCompactUnicodeObject _base;
354    union {
355        void *any;
356        Py_UCS1 *latin1;
357        Py_UCS2 *ucs2;
358        Py_UCS4 *ucs4;
359    } data;                     /* Canonical, smallest-form Unicode buffer */
360} PyUnicodeObject;
361#endif
362
363PyAPI_DATA(PyTypeObject) PyUnicode_Type;
364PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
365
366#define PyUnicode_Check(op) \
367                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
368#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
369
370/* Fast access macros */
371#ifndef Py_LIMITED_API
372
/* Length of the wchar_t representation.  Compact ASCII objects keep it in
   the `length` field; every other form stores it in `wstr_length`.
   No type checks are performed.  op is evaluated more than once, and is
   parenthesized in every cast so any pointer expression can be passed. */
#define PyUnicode_WSTR_LENGTH(op)                      \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
377
378/* Returns the deprecated Py_UNICODE representation's size in code units
379   (this includes surrogate pairs as 2 units).
380   If the Py_UNICODE representation is not available, it will be computed
381   on request.  Use PyUnicode_GET_LENGTH() for the length in code points. */
382
383#define PyUnicode_GET_SIZE(op)                       \
384    (assert(PyUnicode_Check(op)),                    \
385     (((PyASCIIObject *)(op))->wstr) ?               \
386      PyUnicode_WSTR_LENGTH(op) :                    \
387      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
388       assert(((PyASCIIObject *)(op))->wstr),        \
389       PyUnicode_WSTR_LENGTH(op)))
390
391#define PyUnicode_GET_DATA_SIZE(op) \
392    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
393
394/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
395   representation on demand.  Using this macro is very inefficient now,
396   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
397   use PyUnicode_WRITE() and PyUnicode_READ(). */
398
399#define PyUnicode_AS_UNICODE(op) \
400    (assert(PyUnicode_Check(op)), \
401     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
402      PyUnicode_AsUnicode((PyObject *)(op)))
403
404#define PyUnicode_AS_DATA(op) \
405    ((const char *)(PyUnicode_AS_UNICODE(op)))
406
407
408/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
409
410/* Values for PyASCIIObject.state: */
411
/* Interning state, as stored in the 2-bit state.interned field. */
#define SSTATE_NOT_INTERNED      0
#define SSTATE_INTERNED_MORTAL   1
#define SSTATE_INTERNED_IMMORTAL 2
416
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.  When assertions are enabled, the type and the ready state of op
   are checked first.  op is parenthesized in the cast so that any pointer
   expression can be passed. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
424
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   op may be evaluated twice; it is parenthesized in the cast so that any
   pointer expression can be passed. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
434
enum PyUnicode_Kind {
    /* Only the wchar_t (wstr) representation exists.  This is only
       possible when the string was created with a legacy API and
       _PyUnicode_Ready() has not been called yet. */
    PyUnicode_WCHAR_KIND = 0,

    /* Return values of the PyUnicode_KIND() macro; the characters are
       stored as Py_UCS1, Py_UCS2 or Py_UCS4 respectively. */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
445
446/* Return pointers to the canonical representation cast to unsigned char,
447   Py_UCS2, or Py_UCS4 for direct character access.
448   No checks are performed, use PyUnicode_KIND() before to ensure
449   these will work correctly. */
450
451#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
452#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
453#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
454
455/* Return one of the PyUnicode_*_KIND values defined above. */
456#define PyUnicode_KIND(op) \
457    (assert(PyUnicode_Check(op)), \
458     assert(PyUnicode_IS_READY(op)),            \
459     ((PyASCIIObject *)(op))->state.kind)
460
461/* Return a void pointer to the raw unicode buffer. */
462#define _PyUnicode_COMPACT_DATA(op)                     \
463    (PyUnicode_IS_ASCII(op) ?                   \
464     ((void*)((PyASCIIObject*)(op) + 1)) :              \
465     ((void*)((PyCompactUnicodeObject*)(op) + 1)))
466
467#define _PyUnicode_NONCOMPACT_DATA(op)                  \
468    (assert(((PyUnicodeObject*)(op))->data.any),        \
469     ((((PyUnicodeObject *)(op))->data.any)))
470
471#define PyUnicode_DATA(op) \
472    (assert(PyUnicode_Check(op)), \
473     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
474     _PyUnicode_NONCOMPACT_DATA(op))
475
476/* In the access macros below, "kind" may be evaluated more than once.
477   All other macro parameters are evaluated exactly once, so it is safe
478   to put side effects into them (such as increasing the index). */
479
/* Store the code point `value` at position `index` (counted from 0) of the
   canonical buffer `data`, whose element width is selected by `kind`.
   No sanity checks are made beyond an assert() on the kind in the
   fall-through case; the macro is meant for loops in which the caller has
   cached the kind and data pointer obtained from other macro calls. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
502
/* Fetch the code point at position `index` of the canonical buffer `data`,
   interpreting it according to `kind`; the result is a Py_UCS4.  Any kind
   other than the 1-byte or 2-byte one is read as Py_UCS4.  No checks or
   ready calls are performed. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
     ((kind) == PyUnicode_1BYTE_KIND ? ((const Py_UCS1 *)(data))[(index)] : \
      (kind) == PyUnicode_2BYTE_KIND ? ((const Py_UCS2 *)(data))[(index)] : \
                                       ((const Py_UCS4 *)(data))[(index)]))
514
/* Read the code point at `index` of the unicode object `unicode`.

   The kind and the data pointer are looked up on every call, and
   PyUnicode_KIND() may even be evaluated twice, which makes this less
   efficient than PyUnicode_READ().  Use it for single reads; for multiple
   consecutive reads, cache the kind and use PyUnicode_READ() instead. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     PyUnicode_READ(PyUnicode_KIND((unicode)),  \
                    PyUnicode_DATA((unicode)),  \
                    (index)))
530
531/* Returns the length of the unicode string. The caller has to make sure that
532   the string has its canonical representation set before calling
533   this macro.  Call PyUnicode_(FAST_)Ready to ensure that. */
534#define PyUnicode_GET_LENGTH(op)                \
535    (assert(PyUnicode_Check(op)),               \
536     assert(PyUnicode_IS_READY(op)),            \
537     ((PyASCIIObject *)(op))->length)
538
539
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   No type checks are performed.  op is parenthesized in the cast so that
   any pointer expression can be passed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
544
545/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
546   case.  If the canonical representation is not yet set, it will still call
547   _PyUnicode_Ready().
548   Returns 0 on success and -1 on errors. */
549#define PyUnicode_READY(op)                        \
550    (assert(PyUnicode_Check(op)),                       \
551     (PyUnicode_IS_READY(op) ?                          \
552      0 : _PyUnicode_Ready((PyObject *)(op))))
553
554/* Return a maximum character value which is suitable for creating another
555   string based on op.  This is always an approximation but more efficient
556   than iterating over the string. */
557#define PyUnicode_MAX_CHAR_VALUE(op) \
558    (assert(PyUnicode_IS_READY(op)),                                    \
559     (PyUnicode_IS_ASCII(op) ?                                          \
560      (0x7f) :                                                          \
561      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
562       (0xffU) :                                                        \
563       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
564        (0xffffU) :                                                     \
565        (0x10ffffU)))))
566
567#endif
568
569/* --- Constants ---------------------------------------------------------- */
570
571/* This Unicode character will be used as replacement character during
572   decoding if the errors argument is set to "replace". Note: the
573   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
574   Unicode 3.0. */
575
576#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
577
578/* === Public API ========================================================= */
579
580/* --- Plain Py_UNICODE --------------------------------------------------- */
581
582/* With PEP 393, this is the recommended way to allocate a new unicode object.
583   This function will allocate the object and its buffer in a single memory
584   block.  Objects created using this function are not resizable. */
585#ifndef Py_LIMITED_API
586PyAPI_FUNC(PyObject*) PyUnicode_New(
587    Py_ssize_t size,            /* Number of code points in the new string */
588    Py_UCS4 maxchar             /* maximum code point value in the string */
589    );
590#endif
591
592/* Initializes the canonical string representation from the deprecated
593   wstr/Py_UNICODE representation. This function is used to convert Unicode
594   objects which were created using the old API to the new flexible format
595   introduced with PEP 393.
596
597   Don't call this function directly, use the public PyUnicode_READY() macro
598   instead. */
599#ifndef Py_LIMITED_API
600PyAPI_FUNC(int) _PyUnicode_Ready(
601    PyObject *unicode           /* Unicode object */
602    );
603#endif
604
605/* Get a copy of a Unicode string. */
606#ifndef Py_LIMITED_API
607PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
608    PyObject *unicode
609    );
610#endif
611
612/* Copy characters from one unicode object into another. This function performs
613   character conversion when necessary and falls back to memcpy() if possible.
614
615   Fail if to is too small (smaller than *how_many* or smaller than
616   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
617   kind(to), or if *to* has more than 1 reference.
618
619   Return the number of characters written, or return -1 and raise an exception
620   on error.
621
622   Pseudo-code:
623
624       how_many = min(how_many, len(from) - from_start)
625       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
626       return how_many
627
628   Note: The function doesn't write a terminating null character.
629   */
630#ifndef Py_LIMITED_API
631PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
632    PyObject *to,
633    Py_ssize_t to_start,
634    PyObject *from,
635    Py_ssize_t from_start,
636    Py_ssize_t how_many
637    );
638
639/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
640   may crash if parameters are invalid (e.g. if the output string
641   is too short). */
642PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
643    PyObject *to,
644    Py_ssize_t to_start,
645    PyObject *from,
646    Py_ssize_t from_start,
647    Py_ssize_t how_many
648    );
649#endif
650
651#ifndef Py_LIMITED_API
652/* Fill a string with a character: write fill_char into
653   unicode[start:start+length].
654
655   Fail if fill_char is bigger than the string maximum character, or if the
656   string has more than 1 reference.
657
658   Return the number of characters written, or return -1 and raise an exception
659   on error. */
660PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
661    PyObject *unicode,
662    Py_ssize_t start,
663    Py_ssize_t length,
664    Py_UCS4 fill_char
665    );
666
667/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
668   if parameters are invalid (e.g. if length is longer than the string). */
669PyAPI_FUNC(void) _PyUnicode_FastFill(
670    PyObject *unicode,
671    Py_ssize_t start,
672    Py_ssize_t length,
673    Py_UCS4 fill_char
674    );
675#endif
676
677/* Create a Unicode Object from the Py_UNICODE buffer u of the given
678   size.
679
680   u may be NULL which causes the contents to be undefined. It is the
681   user's responsibility to fill in the needed data afterwards. Note
682   that modifying the Unicode object contents after construction is
683   only allowed if u was set to NULL.
684
685   The buffer is copied into the new object. */
686
687#ifndef Py_LIMITED_API
688PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
689    const Py_UNICODE *u,        /* Unicode buffer */
690    Py_ssize_t size             /* size of buffer */
691    );
692#endif
693
694/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
695PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
696    const char *u,             /* UTF-8 encoded string */
697    Py_ssize_t size            /* size of buffer */
698    );
699
700/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
701   UTF-8 encoded bytes.  The size is determined with strlen(). */
702PyAPI_FUNC(PyObject*) PyUnicode_FromString(
703    const char *u              /* UTF-8 encoded string */
704    );
705
706#ifndef Py_LIMITED_API
707/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
708   Scan the string to find the maximum character. */
709PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
710    int kind,
711    const void *buffer,
712    Py_ssize_t size);
713
714/* Create a new string from a buffer of ASCII characters.
715   WARNING: Don't check if the string contains any non-ASCII character. */
716PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
717    const char *buffer,
718    Py_ssize_t size);
719#endif
720
721PyAPI_FUNC(PyObject*) PyUnicode_Substring(
722    PyObject *str,
723    Py_ssize_t start,
724    Py_ssize_t end);
725
726#ifndef Py_LIMITED_API
727/* Compute the maximum character of the substring unicode[start:end].
728   Return 127 for an empty string. */
729PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
730    PyObject *unicode,
731    Py_ssize_t start,
732    Py_ssize_t end);
733#endif
734
735/* Copy the string into a UCS4 buffer including the null character if copy_null
736   is set. Return NULL and raise an exception on error. Raise a SystemError if
737   the buffer is smaller than the string. Return buffer on success.
738
739   buflen is the length of the buffer in (Py_UCS4) characters. */
740PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
741    PyObject *unicode,
742    Py_UCS4* buffer,
743    Py_ssize_t buflen,
744    int copy_null);
745
746/* Copy the string into a UCS4 buffer. A new buffer is allocated using
747 * PyMem_Malloc; if this fails, NULL is returned with a memory error
748   exception set. */
749PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
750
751/* Return a read-only pointer to the Unicode object's internal
752   Py_UNICODE buffer.
753   If the wchar_t/Py_UNICODE representation is not yet available, this
754   function will calculate it. */
755
756#ifndef Py_LIMITED_API
757PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
758    PyObject *unicode           /* Unicode object */
759    );
760#endif
761
762/* Return a read-only pointer to the Unicode object's internal
763   Py_UNICODE buffer and save the length at size.
764   If the wchar_t/Py_UNICODE representation is not yet available, this
765   function will calculate it. */
766
767#ifndef Py_LIMITED_API
768PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
769    PyObject *unicode,          /* Unicode object */
770    Py_ssize_t *size            /* location where to save the length */
771    );
772#endif
773
774/* Get the length of the Unicode object. */
775
776PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
777    PyObject *unicode
778);
779
780/* Get the number of Py_UNICODE units in the
781   string representation. */
782
783PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
784    PyObject *unicode           /* Unicode object */
785    );
786
787/* Read a character from the string. */
788
789PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
790    PyObject *unicode,
791    Py_ssize_t index
792    );
793
794/* Write a character to the string. The string must have been created through
795   PyUnicode_New, must not be shared, and must not have been hashed yet.
796
797   Return 0 on success, -1 on error. */
798
799PyAPI_FUNC(int) PyUnicode_WriteChar(
800    PyObject *unicode,
801    Py_ssize_t index,
802    Py_UCS4 character
803    );
804
805#ifndef Py_LIMITED_API
806/* Get the maximum ordinal for a Unicode character. */
807PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
808#endif
809
810/* Resize a Unicode object. The length is the number of characters, except
811   if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
812   is the number of Py_UNICODE characters.
813
814   *unicode is modified to point to the new (resized) object and 0
815   returned on success.
816
817   Try to resize the string in place (which is usually faster than allocating
818   a new string and copying characters), or create a new string.
819
820   Error handling is implemented as follows: an exception is set, -1
821   is returned and *unicode is left untouched.
822
823   WARNING: The function doesn't check string content, the result may not be a
824            string in canonical representation. */
825
826PyAPI_FUNC(int) PyUnicode_Resize(
827    PyObject **unicode,         /* Pointer to the Unicode object */
828    Py_ssize_t length           /* New length */
829    );
830
831/* Decode obj to a Unicode object.
832
833   bytes, bytearray and other bytes-like objects are decoded according to the
834   given encoding and error handler. The encoding and error handler can be
835   NULL to have the interface use UTF-8 and "strict".
836
837   All other objects (including Unicode objects) raise an exception.
838
839   The API returns NULL in case of an error. The caller is responsible
840   for decref'ing the returned objects.
841
842*/
843
844PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
845    PyObject *obj,              /* Object */
846    const char *encoding,       /* encoding */
847    const char *errors          /* error handling */
848    );
849
850/* Copy an instance of a Unicode subtype to a new true Unicode object if
851   necessary. If obj is already a true Unicode object (not a subtype), return
852   the reference with *incremented* refcount.
853
854   The API returns NULL in case of an error. The caller is responsible
855   for decref'ing the returned objects.
856
857*/
858
859PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
860    PyObject *obj      /* Object */
861    );
862
863PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
864    const char *format,   /* ASCII-encoded string  */
865    va_list vargs
866    );
867PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
868    const char *format,   /* ASCII-encoded string  */
869    ...
870    );
871
872#ifndef Py_LIMITED_API
/* State of a Unicode writer: an incremental builder that is set up with
   _PyUnicodeWriter_Init(), appended to with the _PyUnicodeWriter_Write*()
   functions and turned into a Unicode string by _PyUnicodeWriter_Finish().
   min_length, min_char and overallocate are the tuning knobs meant to be
   set by the caller after initialization. */
873typedef struct {
    /* String object backing the writer; when readonly is 1 this is a shared
       string that cannot be modified (see readonly below). */
874    PyObject *buffer;
    /* NOTE(review): presumably the raw character storage of buffer --
       confirm against the implementation. */
875    void *data;
    /* Current kind of the buffer; _PyUnicodeWriter_PrepareKind() does nothing
       when the requested kind is <= this value. */
876    enum PyUnicode_Kind kind;
    /* _PyUnicodeWriter_Prepare() only takes its fast path when the requested
       maximum character is <= maxchar. */
877    Py_UCS4 maxchar;
    /* _PyUnicodeWriter_Prepare() treats (size - pos) as the room still
       available: presumably size is the allocated length and pos the current
       write position, both in characters -- confirm. */
878    Py_ssize_t size;
879    Py_ssize_t pos;
880
881    /* minimum number of allocated characters (default: 0) */
882    Py_ssize_t min_length;
883
884    /* minimum character (default: 127, ASCII) */
885    Py_UCS4 min_char;
886
887    /* If non-zero, overallocate the buffer (default: 0). */
888    unsigned char overallocate;
889
890    /* If readonly is 1, buffer is a shared string (cannot be modified)
891       and size is set to 0. */
892    unsigned char readonly;
893} _PyUnicodeWriter ;
894
895/* Initialize a Unicode writer.
896 *
897 * By default, the minimum buffer size is 0 characters and overallocation is
898 * disabled. Set min_length, min_char and overallocate attributes to control
899 * the allocation of the buffer. */
900PyAPI_FUNC(void)
901_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
902
903/* Prepare the buffer to write 'length' characters
904   with the specified maximum character.
905
906   Return 0 on success, raise an exception and return -1 on error.

   The macro evaluates to 0 without calling
   _PyUnicodeWriter_PrepareInternal() when MAXCHAR <= WRITER->maxchar and
   LENGTH <= WRITER->size - WRITER->pos, or when LENGTH is 0. Otherwise the
   result is whatever _PyUnicodeWriter_PrepareInternal() returns.

   WARNING: this is a macro and WRITER, LENGTH and MAXCHAR each appear
   several times in the expansion; pass expressions without side effects. */
907#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
908    (((MAXCHAR) <= (WRITER)->maxchar                                  \
909      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
910     ? 0                                                              \
911     : (((LENGTH) == 0)                                               \
912        ? 0                                                           \
913        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
914
915/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
916   instead. */
917PyAPI_FUNC(int)
918_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
919                                 Py_ssize_t length, Py_UCS4 maxchar);
920
921/* Prepare the buffer to have at least the kind KIND.
922   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
923   support characters in range U+0000-U+FFFF.
924
925   Return 0 on success, raise an exception and return -1 on error.

   KIND must not be PyUnicode_WCHAR_KIND (checked with assert()). The macro
   evaluates to 0 without calling _PyUnicodeWriter_PrepareKindInternal()
   when KIND <= WRITER->kind.

   WARNING: this is a macro; KIND and WRITER appear more than once in the
   expansion, so pass expressions without side effects. */
926#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
927    (assert((KIND) != PyUnicode_WCHAR_KIND),                          \
928     (KIND) <= (WRITER)->kind                                         \
929     ? 0                                                              \
930     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
931
932/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
933   macro instead. */
934PyAPI_FUNC(int)
935_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
936                                     enum PyUnicode_Kind kind);
937
938/* Append a Unicode character.
939   Return 0 on success, raise an exception and return -1 on error. */
940PyAPI_FUNC(int)
941_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
942    Py_UCS4 ch
943    );
944
945/* Append a Unicode string.
946   Return 0 on success, raise an exception and return -1 on error. */
947PyAPI_FUNC(int)
948_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
949    PyObject *str               /* Unicode string */
950    );
951
952/* Append a substring of a Unicode string.
953   Return 0 on success, raise an exception and return -1 on error. */
954PyAPI_FUNC(int)
955_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
956    PyObject *str,              /* Unicode string */
957    Py_ssize_t start,
958    Py_ssize_t end
959    );
960
961/* Append an ASCII-encoded byte string.
962   Return 0 on success, raise an exception and return -1 on error. */
963PyAPI_FUNC(int)
964_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
965    const char *str,           /* ASCII-encoded byte string */
966    Py_ssize_t len             /* number of bytes, or -1 if unknown */
967    );
968
969/* Append a latin1-encoded byte string.
970   Return 0 on success, raise an exception and return -1 on error. */
971PyAPI_FUNC(int)
972_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
973    const char *str,           /* latin1-encoded byte string */
974    Py_ssize_t len             /* length in bytes */
975    );
976
977/* Get the value of the writer as a Unicode string. Clear the
978   buffer of the writer. Raise an exception and return NULL
979   on error. */
980PyAPI_FUNC(PyObject *)
981_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
982
983/* Deallocate memory of a writer (clear its internal buffer). */
984PyAPI_FUNC(void)
985_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
986#endif
987
988#ifndef Py_LIMITED_API
989/* Format the object based on the format_spec, as defined in PEP 3101
990   (Advanced String Formatting). */
991PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
992    _PyUnicodeWriter *writer,
993    PyObject *obj,
994    PyObject *format_spec,
995    Py_ssize_t start,
996    Py_ssize_t end);
997#endif
998
999PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
1000PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
1001PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
1002    const char *u              /* UTF-8 encoded string */
1003    );
1004#ifndef Py_LIMITED_API
1005PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
1006#endif
1007
1008/* Read the state.interned field of op.
   Use only if you know it's a string: op is cast to PyASCIIObject * without
   any type check. */
1009#define PyUnicode_CHECK_INTERNED(op) \
1010    (((PyASCIIObject *)(op))->state.interned)
1011
1012/* --- wchar_t support for platforms which support it --------------------- */
1013
1014#ifdef HAVE_WCHAR_H
1015
1016/* Create a Unicode Object from the wchar_t buffer w of the given
1017   size.
1018
1019   The buffer is copied into the new object. */
1020
1021PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
1022    const wchar_t *w,           /* wchar_t buffer */
1023    Py_ssize_t size             /* size of buffer */
1024    );
1025
1026/* Copies the Unicode Object contents into the wchar_t buffer w.  At
1027   most size wchar_t characters are copied.
1028
1029   Note that the resulting wchar_t string may or may not be
1030   0-terminated.  It is the responsibility of the caller to make sure
1031   that the wchar_t string is 0-terminated in case this is required by
1032   the application.
1033
1034   Returns the number of wchar_t characters copied (excluding a
1035   possibly trailing 0-termination character) or -1 in case of an
1036   error. */
1037
1038PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
1039    PyObject *unicode,          /* Unicode object */
1040    wchar_t *w,                 /* wchar_t buffer */
1041    Py_ssize_t size             /* size of buffer */
1042    );
1043
1044/* Convert the Unicode object to a wide character string. The output string
1045   always ends with a nul character. If size is not NULL, write the number of
1046   wide characters (excluding the null character) into *size.
1047
1048   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
1049   on success. On error, raises a MemoryError and returns NULL; *size is
1050   undefined. */
1051
1052PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
1053    PyObject *unicode,          /* Unicode object */
1054    Py_ssize_t *size            /* number of characters of the result */
1055    );
1056
1057#ifndef Py_LIMITED_API
1058PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
1059#endif
1060
1061#endif
1062
1063/* --- Unicode ordinals --------------------------------------------------- */
1064
1065/* Create a Unicode Object from the given Unicode code point ordinal.
1066
1067   The ordinal must be in range(0x110000). A ValueError is
1068   raised in case it is not.
1069
1070*/
1071
1072PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
1073
1074/* --- Free-list management ----------------------------------------------- */
1075
1076/* Clear the free list used by the Unicode implementation.
1077
1078   This can be used to release memory used for objects on the free
1079   list back to the Python memory allocator.
1080
1081*/
1082
1083PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
1084
1085/* === Builtin Codecs =====================================================
1086
1087   Many of these APIs take two arguments, encoding and errors. These
1088   parameters have the same semantics as the ones of the builtin str()
1089   API.
1090
1091   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
1092
1093   Error handling is set by errors which may also be set to NULL
1094   meaning to use the default handling defined for the codec. Default
1095   error handling for all builtin codecs is "strict" (ValueErrors are
1096   raised).
1097
1098   The codecs all use a similar interface. Only deviations from the
1099   generic interface are documented.
1100
1101*/
1102
1103/* --- Manage the default encoding ---------------------------------------- */
1104
1105/* Returns a pointer to the default encoding (UTF-8) of the
1106   Unicode object unicode and the size of the encoded representation
1107   in bytes stored in *size.
1108
1109   In case of an error, *size is not set.
1110
1111   This function caches the UTF-8 encoded string in the unicodeobject
1112   and subsequent calls will return the same string.  The memory is released
1113   when the unicodeobject is deallocated.
1114
1115   _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
1116   support the previous internal function with the same behaviour.
1117
1118   *** This API is for interpreter INTERNAL USE ONLY and will likely
1119   *** be removed or changed in the future.
1120
1121   *** If you need to access the Unicode object as UTF-8 bytes string,
1122   *** please use PyUnicode_AsUTF8String() instead.
1123*/
1124
1125#ifndef Py_LIMITED_API
1126PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
1127    PyObject *unicode,
1128    Py_ssize_t *size);
1129#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
1130#endif
1131
1132/* Returns a pointer to the default encoding (UTF-8) of the
1133   Unicode object unicode.
1134
1135   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1136   in the unicodeobject.
1137
1138   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1139   support the previous internal function with the same behaviour.
1140
1141   Use of this API is DEPRECATED since no size information can be
1142   extracted from the returned data.
1143
1144   *** This API is for interpreter INTERNAL USE ONLY and will likely
1145   *** be removed or changed in the future.
1146
1147   *** If you need to access the Unicode object as UTF-8 bytes string,
1148   *** please use PyUnicode_AsUTF8String() instead.
1149
1150*/
1151
1152#ifndef Py_LIMITED_API
1153PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1154#define _PyUnicode_AsString PyUnicode_AsUTF8
1155#endif
1156
1157/* Returns "utf-8".  */
1158
1159PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
1160
1161/* --- Generic Codecs ----------------------------------------------------- */
1162
1163/* Create a Unicode object by decoding the encoded string s of the
1164   given size. */
1165
1166PyAPI_FUNC(PyObject*) PyUnicode_Decode(
1167    const char *s,              /* encoded string */
1168    Py_ssize_t size,            /* size of buffer */
1169    const char *encoding,       /* encoding */
1170    const char *errors          /* error handling */
1171    );
1172
1173/* Decode a Unicode object unicode and return the result as a Python
1174   object. */
1175
1176PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
1177    PyObject *unicode,          /* Unicode object */
1178    const char *encoding,       /* encoding */
1179    const char *errors          /* error handling */
1180    );
1181
1182/* Decode a Unicode object unicode and return the result as a Unicode
1183   object. */
1184
1185PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
1186    PyObject *unicode,          /* Unicode object */
1187    const char *encoding,       /* encoding */
1188    const char *errors          /* error handling */
1189    );
1190
1191/* Encodes a Py_UNICODE buffer of the given size and returns a
1192   Python string object. */
1193
1194#ifndef Py_LIMITED_API
1195PyAPI_FUNC(PyObject*) PyUnicode_Encode(
1196    const Py_UNICODE *s,        /* Unicode char buffer */
1197    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
1198    const char *encoding,       /* encoding */
1199    const char *errors          /* error handling */
1200    );
1201#endif
1202
1203/* Encodes a Unicode object and returns the result as a Python
1204   object. */
1205
1206PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
1207    PyObject *unicode,          /* Unicode object */
1208    const char *encoding,       /* encoding */
1209    const char *errors          /* error handling */
1210    );
1211
1212/* Encodes a Unicode object and returns the result as a Python string
1213   object. */
1214
1215PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
1216    PyObject *unicode,          /* Unicode object */
1217    const char *encoding,       /* encoding */
1218    const char *errors          /* error handling */
1219    );
1220
1221/* Encodes a Unicode object and returns the result as a Unicode
1222   object. */
1223
1224PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
1225    PyObject *unicode,          /* Unicode object */
1226    const char *encoding,       /* encoding */
1227    const char *errors          /* error handling */
1228    );
1229
1230/* Build an encoding map. */
1231
1232PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1233    PyObject* string            /* 256 character map */
1234   );
1235
1236/* --- UTF-7 Codecs ------------------------------------------------------- */
1237
1238PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
1239    const char *string,         /* UTF-7 encoded string */
1240    Py_ssize_t length,          /* size of string */
1241    const char *errors          /* error handling */
1242    );
1243
1244PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
1245    const char *string,         /* UTF-7 encoded string */
1246    Py_ssize_t length,          /* size of string */
1247    const char *errors,         /* error handling */
1248    Py_ssize_t *consumed        /* bytes consumed */
1249    );
1250
1251#ifndef Py_LIMITED_API
1252PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
1253    const Py_UNICODE *data,     /* Unicode char buffer */
1254    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1255    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1256    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1257    const char *errors          /* error handling */
1258    );
1259PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1260    PyObject *unicode,          /* Unicode object */
1261    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1262    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1263    const char *errors          /* error handling */
1264    );
1265#endif
1266
1267/* --- UTF-8 Codecs ------------------------------------------------------- */
1268
1269PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
1270    const char *string,         /* UTF-8 encoded string */
1271    Py_ssize_t length,          /* size of string */
1272    const char *errors          /* error handling */
1273    );
1274
1275PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
1276    const char *string,         /* UTF-8 encoded string */
1277    Py_ssize_t length,          /* size of string */
1278    const char *errors,         /* error handling */
1279    Py_ssize_t *consumed        /* bytes consumed */
1280    );
1281
1282PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
1283    PyObject *unicode           /* Unicode object */
1284    );
1285
1286#ifndef Py_LIMITED_API
1287PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1288    PyObject *unicode,
1289    const char *errors);
1290
1291PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
1292    const Py_UNICODE *data,     /* Unicode char buffer */
1293    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1294    const char *errors          /* error handling */
1295    );
1296#endif
1297
1298/* --- UTF-32 Codecs ------------------------------------------------------ */
1299
1300/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1301   the corresponding Unicode object.
1302
1303   errors (if non-NULL) defines the error handling. It defaults
1304   to "strict".
1305
1306   If byteorder is non-NULL, the decoder starts decoding using the
1307   given byte order:
1308
1309    *byteorder == -1: little endian
1310    *byteorder == 0:  native order
1311    *byteorder == 1:  big endian
1312
1313   In native mode, the first four bytes of the stream are checked for a
1314   BOM mark. If found, the BOM mark is analysed, the byte order
1315   adjusted and the BOM skipped.  In the other modes, no BOM mark
1316   interpretation is done. After completion, *byteorder is set to the
1317   current byte order at the end of input data.
1318
1319   If byteorder is NULL, the codec starts in native order mode.
1320
1321*/
1322
1323PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
1324    const char *string,         /* UTF-32 encoded string */
1325    Py_ssize_t length,          /* size of string */
1326    const char *errors,         /* error handling */
1327    int *byteorder              /* pointer to byteorder to use
1328                                   0=native;-1=LE,1=BE; updated on
1329                                   exit */
1330    );
1331
1332PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
1333    const char *string,         /* UTF-32 encoded string */
1334    Py_ssize_t length,          /* size of string */
1335    const char *errors,         /* error handling */
1336    int *byteorder,             /* pointer to byteorder to use
1337                                   0=native;-1=LE,1=BE; updated on
1338                                   exit */
1339    Py_ssize_t *consumed        /* bytes consumed */
1340    );
1341
1342/* Returns a Python string using the UTF-32 encoding in native byte
1343   order. The string always starts with a BOM mark.  */
1344
1345PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
1346    PyObject *unicode           /* Unicode object */
1347    );
1348
1349/* Returns a Python string object holding the UTF-32 encoded value of
1350   the Unicode data.
1351
1352   The output is written according to the following byte order, selected
1353   by byteorder:
1354
1355   byteorder == -1: little endian
1356   byteorder == 0:  native byte order (writes a BOM mark)
1357   byteorder == 1:  big endian
1358
1359   If byteorder is 0, the output string will always start with the
1360   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1361   prepended.
1362
1363*/
1364
1365#ifndef Py_LIMITED_API
1366PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
1367    const Py_UNICODE *data,     /* Unicode char buffer */
1368    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1369    const char *errors,         /* error handling */
1370    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1371    );
1372PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1373    PyObject *object,           /* Unicode object */
1374    const char *errors,         /* error handling */
1375    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1376    );
1377#endif
1378
1379/* --- UTF-16 Codecs ------------------------------------------------------ */
1380
1381/* Decodes length bytes from a UTF-16 encoded buffer string and returns
1382   the corresponding Unicode object.
1383
1384   errors (if non-NULL) defines the error handling. It defaults
1385   to "strict".
1386
1387   If byteorder is non-NULL, the decoder starts decoding using the
1388   given byte order:
1389
1390    *byteorder == -1: little endian
1391    *byteorder == 0:  native order
1392    *byteorder == 1:  big endian
1393
1394   In native mode, the first two bytes of the stream are checked for a
1395   BOM mark. If found, the BOM mark is analysed, the byte order
1396   adjusted and the BOM skipped.  In the other modes, no BOM mark
1397   interpretation is done. After completion, *byteorder is set to the
1398   current byte order at the end of input data.
1399
1400   If byteorder is NULL, the codec starts in native order mode.
1401
1402*/
1403
1404PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
1405    const char *string,         /* UTF-16 encoded string */
1406    Py_ssize_t length,          /* size of string */
1407    const char *errors,         /* error handling */
1408    int *byteorder              /* pointer to byteorder to use
1409                                   0=native;-1=LE,1=BE; updated on
1410                                   exit */
1411    );
1412
1413PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
1414    const char *string,         /* UTF-16 encoded string */
1415    Py_ssize_t length,          /* size of string */
1416    const char *errors,         /* error handling */
1417    int *byteorder,             /* pointer to byteorder to use
1418                                   0=native;-1=LE,1=BE; updated on
1419                                   exit */
1420    Py_ssize_t *consumed        /* bytes consumed */
1421    );
1422
1423/* Returns a Python string using the UTF-16 encoding in native byte
1424   order. The string always starts with a BOM mark.  */
1425
1426PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
1427    PyObject *unicode           /* Unicode object */
1428    );
1429
1430/* Returns a Python string object holding the UTF-16 encoded value of
1431   the Unicode data.
1432
1433   The output is written according to the following byte order, selected
1434   by byteorder:
1435
1436   byteorder == -1: little endian
1437   byteorder == 0:  native byte order (writes a BOM mark)
1438   byteorder == 1:  big endian
1439
1440   If byteorder is 0, the output string will always start with the
1441   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1442   prepended.
1443
1444   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1445   UCS-2. This trick makes it possible to add full UTF-16 capabilities
1446   at a later point without compromising the APIs.
1447
1448*/
1449
1450#ifndef Py_LIMITED_API
1451PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
1452    const Py_UNICODE *data,     /* Unicode char buffer */
1453    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1454    const char *errors,         /* error handling */
1455    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1456    );
1457PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1458    PyObject* unicode,          /* Unicode object */
1459    const char *errors,         /* error handling */
1460    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1461    );
1462#endif
1463
1464/* --- Unicode-Escape Codecs ---------------------------------------------- */
1465
1466PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
1467    const char *string,         /* Unicode-Escape encoded string */
1468    Py_ssize_t length,          /* size of string */
1469    const char *errors          /* error handling */
1470    );
1471
1472PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
1473    PyObject *unicode           /* Unicode object */
1474    );
1475
1476#ifndef Py_LIMITED_API
1477PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
1478    const Py_UNICODE *data,     /* Unicode char buffer */
1479    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1480    );
1481#endif
1482
1483/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1484
1485PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
1486    const char *string,         /* Raw-Unicode-Escape encoded string */
1487    Py_ssize_t length,          /* size of string */
1488    const char *errors          /* error handling */
1489    );
1490
1491PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
1492    PyObject *unicode           /* Unicode object */
1493    );
1494
1495#ifndef Py_LIMITED_API
1496PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
1497    const Py_UNICODE *data,     /* Unicode char buffer */
1498    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1499    );
1500#endif
1501
1502/* --- Unicode Internal Codec ---------------------------------------------
1503
1504    Only for internal use in _codecsmodule.c */
1505
1506#ifndef Py_LIMITED_API
1507PyObject *_PyUnicode_DecodeUnicodeInternal(
1508    const char *string,
1509    Py_ssize_t length,
1510    const char *errors
1511    );
1512#endif
1513
1514/* --- Latin-1 Codecs -----------------------------------------------------
1515
1516   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1517
1518*/
1519
1520PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1521    const char *string,         /* Latin-1 encoded string */
1522    Py_ssize_t length,          /* size of string */
1523    const char *errors          /* error handling */
1524    );
1525
1526PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1527    PyObject *unicode           /* Unicode object */
1528    );
1529
1530#ifndef Py_LIMITED_API
1531PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1532    PyObject* unicode,
1533    const char* errors);
1534
1535PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1536    const Py_UNICODE *data,     /* Unicode char buffer */
1537    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1538    const char *errors          /* error handling */
1539    );
1540#endif
1541
1542/* --- ASCII Codecs -------------------------------------------------------
1543
1544   Only 7-bit ASCII data is accepted. All other codes generate errors.
1545
1546*/
1547
1548PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1549    const char *string,         /* ASCII encoded string */
1550    Py_ssize_t length,          /* size of string */
1551    const char *errors          /* error handling */
1552    );
1553
1554PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1555    PyObject *unicode           /* Unicode object */
1556    );
1557
1558#ifndef Py_LIMITED_API
1559PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1560    PyObject* unicode,
1561    const char* errors);
1562
1563PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1564    const Py_UNICODE *data,     /* Unicode char buffer */
1565    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1566    const char *errors          /* error handling */
1567    );
1568#endif
1569
1570/* --- Character Map Codecs -----------------------------------------------
1571
1572   This codec uses mappings to encode and decode characters.
1573
1574   Decoding mappings must map single string characters to single
1575   Unicode characters, integers (which are then interpreted as Unicode
1576   ordinals) or None (meaning "undefined mapping" and causing an
1577   error).
1578
1579   Encoding mappings must map single Unicode characters to single
1580   string characters, integers (which are then interpreted as Latin-1
1581   ordinals) or None (meaning "undefined mapping" and causing an
1582   error).
1583
1584   If a character lookup fails with a LookupError, the character is
1585   copied as-is, meaning that its ordinal value will be interpreted as a
1586   Unicode or Latin-1 ordinal, respectively. Because of this, mappings only
1587   need to contain those entries which map characters to different code
1588   points.
1589
1590*/
1591
1592PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1593    const char *string,         /* Encoded string */
1594    Py_ssize_t length,          /* size of string */
1595    PyObject *mapping,          /* character mapping
1596                                   (char ordinal -> unicode ordinal) */
1597    const char *errors          /* error handling */
1598    );
1599
1600PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1601    PyObject *unicode,          /* Unicode object */
1602    PyObject *mapping           /* character mapping
1603                                   (unicode ordinal -> char ordinal) */
1604    );
1605
1606#ifndef Py_LIMITED_API
1607PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1608    const Py_UNICODE *data,     /* Unicode char buffer */
1609    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1610    PyObject *mapping,          /* character mapping
1611                                   (unicode ordinal -> char ordinal) */
1612    const char *errors          /* error handling */
1613    );
1614PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1615    PyObject *unicode,          /* Unicode object */
1616    PyObject *mapping,          /* character mapping
1617                                   (unicode ordinal -> char ordinal) */
1618    const char *errors          /* error handling */
1619    );
1620#endif
1621
1622/* Translate a Py_UNICODE buffer of the given length by applying a
1623   character mapping table to it and return the resulting Unicode
1624   object.
1625
1626   The mapping table must map Unicode ordinal integers to Unicode
1627   ordinal integers or None (causing deletion of the character).
1628
1629   Mapping tables may be dictionaries or sequences. Unmapped character
1630   ordinals (ones which cause a LookupError) are left untouched and
1631   are copied as-is.
1632
1633*/
1634
1635#ifndef Py_LIMITED_API
1636PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1637    const Py_UNICODE *data,     /* Unicode char buffer */
1638    Py_ssize_t length,          /* Number of Py_UNICODE chars to translate */
1639    PyObject *table,            /* Translate table */
1640    const char *errors          /* error handling */
1641    );
1642#endif
1643
1644#ifdef MS_WINDOWS
1645
1646/* --- MBCS codecs for Windows -------------------------------------------- */
1647
1648PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1649    const char *string,         /* MBCS encoded string */
1650    Py_ssize_t length,          /* size of string */
1651    const char *errors          /* error handling */
1652    );
1653
1654PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1655    const char *string,         /* MBCS encoded string */
1656    Py_ssize_t length,          /* size of string */
1657    const char *errors,         /* error handling */
1658    Py_ssize_t *consumed        /* bytes consumed */
1659    );
1660
1661PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1662    int code_page,              /* code page number */
1663    const char *string,         /* encoded string */
1664    Py_ssize_t length,          /* size of string */
1665    const char *errors,         /* error handling */
1666    Py_ssize_t *consumed        /* bytes consumed */
1667    );
1668
1669PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1670    PyObject *unicode           /* Unicode object */
1671    );
1672
1673#ifndef Py_LIMITED_API
1674PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1675    const Py_UNICODE *data,     /* Unicode char buffer */
1676    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1677    const char *errors          /* error handling */
1678    );
1679#endif
1680
1681PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1682    int code_page,              /* code page number */
1683    PyObject *unicode,          /* Unicode object */
1684    const char *errors          /* error handling */
1685    );
1686
1687#endif /* MS_WINDOWS */
1688
1689/* --- Decimal Encoder ---------------------------------------------------- */
1690
1691/* Takes a Unicode string holding a decimal value and writes it into
1692   an output buffer using standard ASCII digit codes.
1693
1694   The output buffer has to provide at least length+1 bytes of storage
1695   area. The output string is 0-terminated.
1696
1697   The encoder converts whitespace to ' ', decimal characters to their
1698   corresponding ASCII digit and copies all other Latin-1 characters
1699   except \0 as-is. Characters outside this range (Unicode ordinals
1700   1-255) are treated as errors. This includes embedded NUL characters.
1701
1702   Error handling is defined by the errors argument:
1703
1704      NULL or "strict": raise a ValueError
1705      "ignore": ignore the wrong characters (these are not copied to the
1706                output buffer)
1707      "replace": replaces illegal characters with '?'
1708
1709   Returns 0 on success, -1 on failure.
1710
1711*/
1712
1713#ifndef Py_LIMITED_API
1714PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1715    Py_UNICODE *s,              /* Unicode buffer */
1716    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1717    char *output,               /* Output buffer; must have size >= length+1
1718                                   (the output is 0-terminated) */
1719    const char *errors          /* error handling */
1720    );
1720#endif
1721
1722/* Transforms code points that have decimal digit property to the
1723   corresponding ASCII digit code points.
1724
1725   Returns a new Unicode string on success, NULL on failure.
1726*/
1727
1728#ifndef Py_LIMITED_API
1729PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1730    Py_UNICODE *s,              /* Unicode buffer */
1731    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
1732    );
1733#endif
1734
1735/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
1736   as argument instead of a raw buffer and length.  This function additionally
1737   transforms spaces to ASCII because this is what the callers in longobject,
1738   floatobject, and complexobject did anyways. */
1739
1740#ifndef Py_LIMITED_API
1741PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1742    PyObject *unicode           /* Unicode object */
1743    );
1744#endif
1745
1746/* --- Locale encoding --------------------------------------------------- */
1747
1748/* Decode a string from the current locale encoding. The decoder is strict if
1749   *errors* is NULL or "strict"; if *errors* is "surrogateescape", it uses the
1750   'surrogateescape' error handler (PEP 383) to escape undecodable bytes. If a
1751   byte sequence can be decoded as a surrogate character and the
1752   'surrogateescape' error handler is in use, the byte sequence is escaped
1753   instead of being decoded. *str* must end with a null character but cannot
1754   contain embedded null characters. */
1755
1756PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1757    const char *str,            /* locale-encoded string */
1758    Py_ssize_t len,             /* size of string */
1759    const char *errors);        /* error handling */
1760
1761/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1762   length using strlen(). */
1763
1764PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1765    const char *str,            /* null-terminated locale-encoded string */
1766    const char *errors);        /* error handling */
1767
1768/* Encode a Unicode object to the current locale encoding. The encoder is
1769   strict if *errors* is NULL or "strict"; if *errors* is "surrogateescape",
1770   the "surrogateescape" error handler is used. Return a bytes object. The
1771   string cannot contain embedded null characters. */
1772
1773PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1774    PyObject *unicode,          /* Unicode object */
1775    const char *errors          /* error handling */
1776    );
1777
1778/* --- File system encoding ---------------------------------------------- */
1779
1780/* ParseTuple converter: encode str objects to bytes using
1781   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1782
1783PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1784
1785/* ParseTuple converter: decode bytes objects to unicode using
1786   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1787
1788PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1789
1790/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1791   and the "surrogateescape" error handler.
1792
1793   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1794   encoding.
1795
1796   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1797*/
1798
1799PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1800    const char *s               /* encoded string */
1801    );
1802
1803/* Decode a string using Py_FileSystemDefaultEncoding
1804   and the "surrogateescape" error handler.
1805
1806   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1807   encoding.
1808*/
1809
1810PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1811    const char *s,               /* encoded string */
1812    Py_ssize_t size              /* size */
1813    );
1814
1815/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1816   "surrogateescape" error handler, and return bytes.
1817
1818   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1819   encoding.
1820*/
1821
1822PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1823    PyObject *unicode           /* Unicode object */
1824    );
1825
1826/* --- Methods & Slots ----------------------------------------------------
1827
1828   These are capable of handling Unicode objects and strings on input
1829   (we refer to them as strings in the descriptions) and return
1830   Unicode objects or integers as appropriate. */
1831
1832/* Concat two strings giving a new Unicode string. */
1833
1834PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1835    PyObject *left,             /* Left string */
1836    PyObject *right             /* Right string */
1837    );
1838
1839/* Concat two strings and put the result in *pleft
1840   (sets *pleft to NULL on error) */
1841
1842PyAPI_FUNC(void) PyUnicode_Append(
1843    PyObject **pleft,           /* Pointer to left string */
1844    PyObject *right             /* Right string */
1845    );
1846
1847/* Concat two strings, put the result in *pleft and drop the right object
1848   (sets *pleft to NULL on error) */
1849
1850PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1851    PyObject **pleft,           /* Pointer to left string */
1852    PyObject *right             /* Right string */
1853    );
1854
1855/* Split a string giving a list of Unicode strings.
1856
1857   If sep is NULL, splitting will be done at all whitespace
1858   substrings. Otherwise, splits occur at the given separator.
1859
1860   At most maxsplit splits will be done. If negative, no limit is set.
1861
1862   Separators are not included in the resulting list.
1863
1864*/
1865
1866PyAPI_FUNC(PyObject*) PyUnicode_Split(
1867    PyObject *s,                /* String to split */
1868    PyObject *sep,              /* String separator */
1869    Py_ssize_t maxsplit         /* Maxsplit count */
1870    );
1871
1872/* Ditto, but split at line breaks.
1873
1874   CRLF is considered to be one line break. Line breaks are not
1875   included in the resulting list unless keepends is true. */
1876
1877PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1878    PyObject *s,                /* String to split */
1879    int keepends                /* If true, line end markers are included */
1880    );
1881
1882/* Partition a string using a given separator. */
1883
1884PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1885    PyObject *s,                /* String to partition */
1886    PyObject *sep               /* String separator */
1887    );
1888
1889/* Partition a string using a given separator, searching from the end of the
1890   string. */
1891
1892PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1893    PyObject *s,                /* String to partition */
1894    PyObject *sep               /* String separator */
1895    );
1896
1897/* Split a string giving a list of Unicode strings.
1898
1899   If sep is NULL, splitting will be done at all whitespace
1900   substrings. Otherwise, splits occur at the given separator.
1901
1902   At most maxsplit splits will be done. Unlike PyUnicode_Split,
1903   PyUnicode_RSplit splits from the end of the string. If maxsplit is
1904   negative, no limit is set.
1905
1906   Separators are not included in the resulting list.
1907
1908*/
1909
1910PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1911    PyObject *s,                /* String to split */
1912    PyObject *sep,              /* String separator */
1913    Py_ssize_t maxsplit         /* Maxsplit count */
1914    );
1915
1916/* Translate a string by applying a character mapping table to it and
1917   return the resulting Unicode object.
1918
1919   The mapping table must map Unicode ordinal integers to Unicode
1920   ordinal integers or None (causing deletion of the character).
1921
1922   Mapping tables may be dictionaries or sequences. Unmapped character
1923   ordinals (ones which cause a LookupError) are left untouched and
1924   are copied as-is.
1925
1926*/
1927
1928PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1929    PyObject *str,              /* String */
1930    PyObject *table,            /* Translate table */
1931    const char *errors          /* error handling */
1932    );
1933
1934/* Join a sequence of strings using the given separator and return
1935   the resulting Unicode string. */
1936
1937PyAPI_FUNC(PyObject*) PyUnicode_Join(
1938    PyObject *separator,        /* Separator string */
1939    PyObject *seq               /* Sequence object */
1940    );
1941
1942#ifndef Py_LIMITED_API
1943PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
1944    PyObject *separator,
1945    PyObject **items,
1946    Py_ssize_t seqlen
1947    );
1948#endif /* Py_LIMITED_API */
1949
1950/* Return 1 if substr matches str[start:end] at the given tail end, 0
1951   otherwise. Return -1 if an error occurred. */
1952
1953PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1954    PyObject *str,              /* String */
1955    PyObject *substr,           /* Prefix or Suffix string */
1956    Py_ssize_t start,           /* Start index */
1957    Py_ssize_t end,             /* Stop index */
1958    int direction               /* Tail end: -1 prefix, +1 suffix */
1959    );
1960
1961/* Return the first position of substr in str[start:end] using the
1962   given search direction or -1 if not found. -2 is returned in case
1963   an error occurred and an exception is set. */
1964
1965PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1966    PyObject *str,              /* String */
1967    PyObject *substr,           /* Substring to find */
1968    Py_ssize_t start,           /* Start index */
1969    Py_ssize_t end,             /* Stop index */
1970    int direction               /* Find direction: +1 forward, -1 backward */
1971    );
1972
1973/* Like PyUnicode_Find, but search for single character only. */
1974PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1975    PyObject *str,              /* String */
1976    Py_UCS4 ch,                 /* Character to find */
1977    Py_ssize_t start,           /* Start index */
1978    Py_ssize_t end,             /* Stop index */
1979    int direction               /* Find direction: +1 forward, -1 backward */
1980    );
1981
1982/* Count the number of occurrences of substr in str[start:end]. */
1983
1984PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1985    PyObject *str,              /* String */
1986    PyObject *substr,           /* Substring to count */
1987    Py_ssize_t start,           /* Start index */
1988    Py_ssize_t end              /* Stop index */
1989    );
1990
1991/* Replace at most maxcount occurrences of substr in str with replstr
1992   and return the resulting Unicode object. */
1993
1994PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1995    PyObject *str,              /* String */
1996    PyObject *substr,           /* Substring to find */
1997    PyObject *replstr,          /* Substring to replace */
1998    Py_ssize_t maxcount         /* Max. number of replacements to apply;
1999                                   -1 = all */
2000    );
2001
2002/* Compare two strings and return -1, 0, 1 for less than, equal,
2003   greater than resp.
2004   Raise an exception and return -1 on error. */
2005
2006PyAPI_FUNC(int) PyUnicode_Compare(
2007    PyObject *left,             /* Left string */
2008    PyObject *right             /* Right string */
2009    );
2010
2011#ifndef Py_LIMITED_API
2012PyAPI_FUNC(int) _PyUnicode_CompareWithId(
2013    PyObject *left,             /* Left string */
2014    _Py_Identifier *right       /* Right identifier */
2015    );
2016#endif
2017
2018PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
2019    PyObject *left,             /* Left string */
2020    const char *right           /* ASCII-encoded string */
2021    );
2022
2023/* Rich compare two strings and return one of the following:
2024
2025   - NULL in case an exception was raised
2026   - Py_True or Py_False for successful comparisons
2027   - Py_NotImplemented in case the type combination is unknown
2028
2029   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
2030   case the conversion of the arguments to Unicode fails with a
2031   UnicodeDecodeError.
2032
2033   Possible values for op:
2034
2035     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
2036
2037*/
2038
2039PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
2040    PyObject *left,             /* Left string */
2041    PyObject *right,            /* Right string */
2042    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
2043    );
2044
2045/* Apply an argument tuple or dictionary to a format string and return
2046   the resulting Unicode string. */
2047
2048PyAPI_FUNC(PyObject *) PyUnicode_Format(
2049    PyObject *format,           /* Format string */
2050    PyObject *args              /* Argument tuple or dictionary */
2051    );
2052
2053/* Check whether element is contained in container and return 1 or 0
2054   accordingly.
2055
2056   element has to coerce to a one element Unicode string. -1 is
2057   returned in case of an error. */
2058
2059PyAPI_FUNC(int) PyUnicode_Contains(
2060    PyObject *container,        /* Container string */
2061    PyObject *element           /* Element string */
2062    );
2063
2064/* Checks whether argument is a valid identifier. */
2065
2066PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
2067
2068#ifndef Py_LIMITED_API
2069/* Externally visible for str.strip(unicode) */
2070PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
2071    PyObject *self,
2072    int striptype,
2073    PyObject *sepobj
2074    );
2075#endif
2076
2077/* Using explicit passed-in values, insert the thousands grouping
2078   into the string pointed to by buffer.  For the argument descriptions,
2079   see Objects/stringlib/localeutil.h */
2080#ifndef Py_LIMITED_API
2081PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
2082    PyObject *unicode,
2083    Py_ssize_t index,
2084    Py_ssize_t n_buffer,
2085    void *digits,
2086    Py_ssize_t n_digits,
2087    Py_ssize_t min_width,
2088    const char *grouping,
2089    PyObject *thousands_sep,
2090    Py_UCS4 *maxchar);
2091#endif
2092/* === Characters Type APIs =============================================== */
2093
2094/* Helper array used by Py_UNICODE_ISSPACE(). */
2095
2096#ifndef Py_LIMITED_API
2097PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
2098
2099/* These should not be used directly. Use the Py_UNICODE_IS* and
2100   Py_UNICODE_TO* macros instead.
2101
2102   These APIs are implemented in Objects/unicodectype.c.
2103
2104*/
2105
2106PyAPI_FUNC(int) _PyUnicode_IsLowercase(
2107    Py_UCS4 ch       /* Unicode character */
2108    );
2109
2110PyAPI_FUNC(int) _PyUnicode_IsUppercase(
2111    Py_UCS4 ch       /* Unicode character */
2112    );
2113
2114PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
2115    Py_UCS4 ch       /* Unicode character */
2116    );
2117
2118PyAPI_FUNC(int) _PyUnicode_IsXidStart(
2119    Py_UCS4 ch       /* Unicode character */
2120    );
2121
2122PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
2123    Py_UCS4 ch       /* Unicode character */
2124    );
2125
2126PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
2127    const Py_UCS4 ch         /* Unicode character */
2128    );
2129
2130PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
2131    const Py_UCS4 ch         /* Unicode character */
2132    );
2133
2134PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2135    Py_UCS4 ch       /* Unicode character */
2136    );
2137
2138PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2139    Py_UCS4 ch       /* Unicode character */
2140    );
2141
2142PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2143    Py_UCS4 ch       /* Unicode character */
2144    );
2145
2146PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2147    Py_UCS4 ch,       /* Unicode character */
2148    Py_UCS4 *res
2149    );
2150
2151PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2152    Py_UCS4 ch,       /* Unicode character */
2153    Py_UCS4 *res
2154    );
2155
2156PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2157    Py_UCS4 ch,       /* Unicode character */
2158    Py_UCS4 *res
2159    );
2160
2161PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2162    Py_UCS4 ch,       /* Unicode character */
2163    Py_UCS4 *res
2164    );
2165
2166PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
2167    Py_UCS4 ch         /* Unicode character */
2168    );
2169
2170PyAPI_FUNC(int) _PyUnicode_IsCased(
2171    Py_UCS4 ch         /* Unicode character */
2172    );
2173
2174PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
2175    Py_UCS4 ch       /* Unicode character */
2176    );
2177
2178PyAPI_FUNC(int) _PyUnicode_ToDigit(
2179    Py_UCS4 ch       /* Unicode character */
2180    );
2181
2182PyAPI_FUNC(double) _PyUnicode_ToNumeric(
2183    Py_UCS4 ch       /* Unicode character */
2184    );
2185
2186PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
2187    Py_UCS4 ch       /* Unicode character */
2188    );
2189
2190PyAPI_FUNC(int) _PyUnicode_IsDigit(
2191    Py_UCS4 ch       /* Unicode character */
2192    );
2193
2194PyAPI_FUNC(int) _PyUnicode_IsNumeric(
2195    Py_UCS4 ch       /* Unicode character */
2196    );
2197
2198PyAPI_FUNC(int) _PyUnicode_IsPrintable(
2199    Py_UCS4 ch       /* Unicode character */
2200    );
2201
2202PyAPI_FUNC(int) _PyUnicode_IsAlpha(
2203    Py_UCS4 ch       /* Unicode character */
2204    );
2205
2206PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2207    const Py_UNICODE *u
2208    );
2209
2210PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
2211    Py_UNICODE *s1,
2212    const Py_UNICODE *s2);
2213
2214PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2215    Py_UNICODE *s1, const Py_UNICODE *s2);
2216
2217PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
2218    Py_UNICODE *s1,
2219    const Py_UNICODE *s2,
2220    size_t n);
2221
2222PyAPI_FUNC(int) Py_UNICODE_strcmp(
2223    const Py_UNICODE *s1,
2224    const Py_UNICODE *s2
2225    );
2226
2227PyAPI_FUNC(int) Py_UNICODE_strncmp(
2228    const Py_UNICODE *s1,
2229    const Py_UNICODE *s2,
2230    size_t n
2231    );
2232
2233PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
2234    const Py_UNICODE *s,
2235    Py_UNICODE c
2236    );
2237
2238PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
2239    const Py_UNICODE *s,
2240    Py_UNICODE c
2241    );
2242
2243PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
2244
2245/* Create a nul-terminated copy of a unicode string. Return NULL
2246   and raise a MemoryError exception on memory allocation failure, otherwise
2247   return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2248
2249PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
2250    PyObject *unicode
2251    );
2252#endif /* Py_LIMITED_API */
2253
2254#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
2255PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
2256    PyObject *op,
2257    int check_content);
2258#endif
2259
2260#ifndef Py_LIMITED_API
2261/* Return an interned Unicode object for an Identifier; may fail if there is no memory. */
2262PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2263/* Clear all static strings. */
2264PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2265
2266/* Fast equality check when the inputs are known to be exact unicode types
2267   and where the hash values are equal (i.e. a very probable match) */
2268PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
2269#endif /* !Py_LIMITED_API */
2270
2271#ifdef __cplusplus
2272}
2273#endif
2274#endif /* !Py_UNICODEOBJECT_H */
2275